diff --git a/sys/crypto/openssl/amd64/aes-gcm-avx512.S b/sys/crypto/openssl/amd64/aes-gcm-avx512.S new file mode 100644 index 000000000000..6ddd1f994704 --- /dev/null +++ b/sys/crypto/openssl/amd64/aes-gcm-avx512.S @@ -0,0 +1,136132 @@ +/* $FreeBSD$ */ +/* Do not modify. This file is auto-generated from aes-gcm-avx512.pl. */ +.globl ossl_vaes_vpclmulqdq_capable +.type ossl_vaes_vpclmulqdq_capable,@function +.align 32 +ossl_vaes_vpclmulqdq_capable: + movq OPENSSL_ia32cap_P+8(%rip),%rcx + + movq $6600291188736,%rdx + xorl %eax,%eax + andq %rdx,%rcx + cmpq %rdx,%rcx + cmoveq %rcx,%rax + .byte 0xf3,0xc3 +.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable +.text +.globl ossl_aes_gcm_init_avx512 +.type ossl_aes_gcm_init_avx512,@function +.align 32 +ossl_aes_gcm_init_avx512: +.cfi_startproc +.byte 243,15,30,250 + vpxorq %xmm16,%xmm16,%xmm16 + + + movl 240(%rdi),%eax + cmpl $9,%eax + je .Laes_128_duiuljAybFADyhe + cmpl $11,%eax + je .Laes_192_duiuljAybFADyhe + cmpl $13,%eax + je .Laes_256_duiuljAybFADyhe + jmp .Lexit_aes_duiuljAybFADyhe +.align 32 +.Laes_128_duiuljAybFADyhe: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenclast 160(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_duiuljAybFADyhe +.align 32 +.Laes_192_duiuljAybFADyhe: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenc 160(%rdi),%xmm16,%xmm16 + + vaesenc 176(%rdi),%xmm16,%xmm16 + + vaesenclast 192(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_duiuljAybFADyhe +.align 32 +.Laes_256_duiuljAybFADyhe: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenc 160(%rdi),%xmm16,%xmm16 + + vaesenc 176(%rdi),%xmm16,%xmm16 + + vaesenc 192(%rdi),%xmm16,%xmm16 + + vaesenc 208(%rdi),%xmm16,%xmm16 + + vaesenclast 224(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_duiuljAybFADyhe +.Lexit_aes_duiuljAybFADyhe: + + vpshufb SHUF_MASK(%rip),%xmm16,%xmm16 + + vmovdqa64 %xmm16,%xmm2 + vpsllq $1,%xmm16,%xmm16 + vpsrlq $63,%xmm2,%xmm2 + vmovdqa %xmm2,%xmm1 + vpslldq $8,%xmm2,%xmm2 + vpsrldq $8,%xmm1,%xmm1 + vporq %xmm2,%xmm16,%xmm16 + + vpshufd $36,%xmm1,%xmm2 + vpcmpeqd TWOONE(%rip),%xmm2,%xmm2 + vpand POLY(%rip),%xmm2,%xmm2 + vpxorq %xmm2,%xmm16,%xmm16 + + vmovdqu64 %xmm16,336(%rsi) + vshufi32x4 $0x00,%ymm16,%ymm16,%ymm4 + vmovdqa %ymm4,%ymm3 + + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm0 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm1 + vpclmulqdq $0x01,%ymm4,%ymm3,%ymm2 + vpclmulqdq $0x10,%ymm4,%ymm3,%ymm3 + vpxorq %ymm2,%ymm3,%ymm3 + + vpsrldq $8,%ymm3,%ymm2 + vpslldq $8,%ymm3,%ymm3 + vpxorq %ymm2,%ymm0,%ymm0 + vpxorq %ymm1,%ymm3,%ymm3 + + + + vmovdqu64 POLY2(%rip),%ymm2 + + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm1 + vpslldq $8,%ymm1,%ymm1 + vpxorq %ymm1,%ymm3,%ymm3 + + + + vpclmulqdq 
$0x00,%ymm3,%ymm2,%ymm1 + vpsrldq $4,%ymm1,%ymm1 + vpclmulqdq $0x10,%ymm3,%ymm2,%ymm3 + vpslldq $4,%ymm3,%ymm3 + + vpternlogq $0x96,%ymm1,%ymm0,%ymm3 + + vmovdqu64 %xmm3,320(%rsi) + vinserti64x2 $1,%xmm16,%ymm3,%ymm4 + vmovdqa64 %ymm4,%ymm5 + + vpclmulqdq $0x11,%ymm3,%ymm4,%ymm0 + vpclmulqdq $0x00,%ymm3,%ymm4,%ymm1 + vpclmulqdq $0x01,%ymm3,%ymm4,%ymm2 + vpclmulqdq $0x10,%ymm3,%ymm4,%ymm4 + vpxorq %ymm2,%ymm4,%ymm4 + + vpsrldq $8,%ymm4,%ymm2 + vpslldq $8,%ymm4,%ymm4 + vpxorq %ymm2,%ymm0,%ymm0 + vpxorq %ymm1,%ymm4,%ymm4 + + + + vmovdqu64 POLY2(%rip),%ymm2 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm1 + vpslldq $8,%ymm1,%ymm1 + vpxorq %ymm1,%ymm4,%ymm4 + + + + vpclmulqdq $0x00,%ymm4,%ymm2,%ymm1 + vpsrldq $4,%ymm1,%ymm1 + vpclmulqdq $0x10,%ymm4,%ymm2,%ymm4 + vpslldq $4,%ymm4,%ymm4 + + vpternlogq $0x96,%ymm1,%ymm0,%ymm4 + + vmovdqu64 %ymm4,288(%rsi) + + vinserti64x4 $1,%ymm5,%zmm4,%zmm4 + + + vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3 + vmovdqa64 %zmm4,%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm2,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm2 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm4 + + vmovdqu64 %zmm4,224(%rsi) + vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3 + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm2,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm2 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm5,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm5,%zmm2,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm5 + + vmovdqu64 %zmm5,160(%rsi) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm2,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm2 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm4 + + vmovdqu64 %zmm4,96(%rsi) + vzeroupper +.Labort_init: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512 +.globl ossl_aes_gcm_setiv_avx512 +.type ossl_aes_gcm_setiv_avx512,@function +.align 32 +ossl_aes_gcm_setiv_avx512: +.cfi_startproc +.Lsetiv_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Lsetiv_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Lsetiv_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Lsetiv_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Lsetiv_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Lsetiv_seh_push_r14: + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsetiv_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Lsetiv_seh_setfp: + +.Lsetiv_seh_prolog_end: + subq $820,%rsp + andq $(-64),%rsp + cmpq $12,%rcx + je iv_len_12_init_IV + vpxor %xmm2,%xmm2,%xmm2 + movq %rdx,%r10 + movq %rcx,%r11 + orq %r11,%r11 + jz .L_CALC_AAD_done_mBgdvxqgFGebeug + + xorq %rbx,%rbx + vmovdqa64 SHUF_MASK(%rip),%zmm16 + +.L_get_AAD_loop48x16_mBgdvxqgFGebeug: + cmpq $768,%r11 + jl .L_exit_AAD_loop48x16_mBgdvxqgFGebeug + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_EzsAegbBbaerfwt + + vmovdqu64 288(%rsi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rsi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rsi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rsi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + 
vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,192(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,128(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,64(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,0(%rsp) +.L_skip_hkeys_precomputation_EzsAegbBbaerfwt: + movq $1,%rbx + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 0(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 64(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 128(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 
192(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 512(%r10),%zmm11 + vmovdqu64 576(%r10),%zmm3 + vmovdqu64 640(%r10),%zmm4 + vmovdqu64 704(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm6,%xmm9,%xmm2 + + subq $768,%r11 + je 
.L_CALC_AAD_done_mBgdvxqgFGebeug + + addq $768,%r10 + jmp .L_get_AAD_loop48x16_mBgdvxqgFGebeug + +.L_exit_AAD_loop48x16_mBgdvxqgFGebeug: + + cmpq $512,%r11 + jl .L_less_than_32x16_mBgdvxqgFGebeug + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_xCxmdbgxoCdwefc + + vmovdqu64 288(%rsi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rsi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rsi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rsi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) +.L_skip_hkeys_precomputation_xCxmdbgxoCdwefc: + movq $1,%rbx + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq 
$0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm6,%xmm9,%xmm2 + + subq $512,%r11 + je .L_CALC_AAD_done_mBgdvxqgFGebeug + + addq $512,%r10 + jmp .L_less_than_16x16_mBgdvxqgFGebeug + +.L_less_than_32x16_mBgdvxqgFGebeug: + cmpq $256,%r11 + jl .L_less_than_16x16_mBgdvxqgFGebeug + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 96(%rsi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 160(%rsi),%zmm19 + vpclmulqdq 
$0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 224(%rsi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 288(%rsi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm6,%xmm9,%xmm2 + + subq $256,%r11 + je .L_CALC_AAD_done_mBgdvxqgFGebeug + + addq $256,%r10 + +.L_less_than_16x16_mBgdvxqgFGebeug: + + leaq byte64_len_to_mask_table(%rip),%r12 + leaq (%r12,%r11,8),%r12 + + + addl $15,%r11d + shrl $4,%r11d + cmpl $2,%r11d + jb .L_AAD_blocks_1_mBgdvxqgFGebeug + je .L_AAD_blocks_2_mBgdvxqgFGebeug + cmpl $4,%r11d + jb .L_AAD_blocks_3_mBgdvxqgFGebeug + je .L_AAD_blocks_4_mBgdvxqgFGebeug + cmpl $6,%r11d + jb .L_AAD_blocks_5_mBgdvxqgFGebeug + je .L_AAD_blocks_6_mBgdvxqgFGebeug + cmpl $8,%r11d + jb .L_AAD_blocks_7_mBgdvxqgFGebeug + je .L_AAD_blocks_8_mBgdvxqgFGebeug + cmpl $10,%r11d + jb .L_AAD_blocks_9_mBgdvxqgFGebeug + je .L_AAD_blocks_10_mBgdvxqgFGebeug + cmpl $12,%r11d + jb .L_AAD_blocks_11_mBgdvxqgFGebeug + je .L_AAD_blocks_12_mBgdvxqgFGebeug + cmpl $14,%r11d + jb .L_AAD_blocks_13_mBgdvxqgFGebeug + je .L_AAD_blocks_14_mBgdvxqgFGebeug + cmpl $15,%r11d + je .L_AAD_blocks_15_mBgdvxqgFGebeug +.L_AAD_blocks_16_mBgdvxqgFGebeug: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 96(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 160(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 224(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm9,%zmm11,%zmm1 + vpternlogq $0x96,%zmm10,%zmm3,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm12,%zmm11,%zmm7 + vpternlogq $0x96,%zmm13,%zmm3,%zmm8 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12 + vpclmulqdq 
$0x10,%zmm15,%zmm5,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_15_mBgdvxqgFGebeug: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 112(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 176(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 240(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_14_mBgdvxqgFGebeug: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%ymm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %ymm16,%ymm5,%ymm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 128(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 192(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq 
$0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 256(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm5,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_13_mBgdvxqgFGebeug: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%xmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %xmm16,%xmm5,%xmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 144(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 208(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 272(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_12_mBgdvxqgFGebeug: + subq $1024,%r12 + kmovq 
(%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 160(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 224(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_11_mBgdvxqgFGebeug: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 176(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 240(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_10_mBgdvxqgFGebeug: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 
64(%r10),%zmm3 + vmovdqu8 128(%r10),%ymm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %ymm16,%ymm4,%ymm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 192(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 256(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_9_mBgdvxqgFGebeug: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%xmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %xmm16,%xmm4,%xmm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 208(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 272(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_8_mBgdvxqgFGebeug: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + 
vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 224(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_7_mBgdvxqgFGebeug: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 240(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_6_mBgdvxqgFGebeug: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%ymm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %ymm16,%ymm3,%ymm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 256(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq 
%zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_5_mBgdvxqgFGebeug: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%xmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %xmm16,%xmm3,%xmm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 272(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_4_mBgdvxqgFGebeug: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_3_mBgdvxqgFGebeug: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq 
%zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_2_mBgdvxqgFGebeug: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%ymm11{%k1}{z} + vpshufb %ymm16,%ymm11,%ymm11 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_mBgdvxqgFGebeug +.L_AAD_blocks_1_mBgdvxqgFGebeug: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%xmm11{%k1}{z} + vpshufb %xmm16,%xmm11,%xmm11 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + +.L_CALC_AAD_done_mBgdvxqgFGebeug: + movq %rcx,%r10 + shlq $3,%r10 + vmovq %r10,%xmm3 + + + vpxorq %xmm2,%xmm3,%xmm2 + + vmovdqu64 336(%rsi),%xmm1 + + vpclmulqdq $0x11,%xmm1,%xmm2,%xmm11 + vpclmulqdq $0x00,%xmm1,%xmm2,%xmm3 + vpclmulqdq $0x01,%xmm1,%xmm2,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm2,%xmm2 + vpxorq %xmm4,%xmm2,%xmm2 + + vpsrldq $8,%xmm2,%xmm4 + vpslldq $8,%xmm2,%xmm2 + vpxorq %xmm4,%xmm11,%xmm11 + vpxorq %xmm3,%xmm2,%xmm2 + + + + vmovdqu64 POLY2(%rip),%xmm4 + + vpclmulqdq $0x01,%xmm2,%xmm4,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm2,%xmm2 + + + + vpclmulqdq $0x00,%xmm2,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm2,%xmm4,%xmm2 + vpslldq $4,%xmm2,%xmm2 + + vpternlogq $0x96,%xmm3,%xmm11,%xmm2 + + vpshufb SHUF_MASK(%rip),%xmm2,%xmm2 + jmp skip_iv_len_12_init_IV +iv_len_12_init_IV: + + vmovdqu8 ONEf(%rip),%xmm2 + movq %rdx,%r11 + movl $0x0000000000000fff,%r10d + kmovq %r10,%k1 + 
vmovdqu8 (%r11),%xmm2{%k1} +skip_iv_len_12_init_IV: + vmovdqu %xmm2,%xmm1 + + + movl 240(%rdi),%r10d + cmpl $9,%r10d + je .Laes_128_wbuuzwjyGbjeaox + cmpl $11,%r10d + je .Laes_192_wbuuzwjyGbjeaox + cmpl $13,%r10d + je .Laes_256_wbuuzwjyGbjeaox + jmp .Lexit_aes_wbuuzwjyGbjeaox +.align 32 +.Laes_128_wbuuzwjyGbjeaox: + vpxorq 0(%rdi),%xmm1,%xmm1 + + vaesenc 16(%rdi),%xmm1,%xmm1 + + vaesenc 32(%rdi),%xmm1,%xmm1 + + vaesenc 48(%rdi),%xmm1,%xmm1 + + vaesenc 64(%rdi),%xmm1,%xmm1 + + vaesenc 80(%rdi),%xmm1,%xmm1 + + vaesenc 96(%rdi),%xmm1,%xmm1 + + vaesenc 112(%rdi),%xmm1,%xmm1 + + vaesenc 128(%rdi),%xmm1,%xmm1 + + vaesenc 144(%rdi),%xmm1,%xmm1 + + vaesenclast 160(%rdi),%xmm1,%xmm1 + jmp .Lexit_aes_wbuuzwjyGbjeaox +.align 32 +.Laes_192_wbuuzwjyGbjeaox: + vpxorq 0(%rdi),%xmm1,%xmm1 + + vaesenc 16(%rdi),%xmm1,%xmm1 + + vaesenc 32(%rdi),%xmm1,%xmm1 + + vaesenc 48(%rdi),%xmm1,%xmm1 + + vaesenc 64(%rdi),%xmm1,%xmm1 + + vaesenc 80(%rdi),%xmm1,%xmm1 + + vaesenc 96(%rdi),%xmm1,%xmm1 + + vaesenc 112(%rdi),%xmm1,%xmm1 + + vaesenc 128(%rdi),%xmm1,%xmm1 + + vaesenc 144(%rdi),%xmm1,%xmm1 + + vaesenc 160(%rdi),%xmm1,%xmm1 + + vaesenc 176(%rdi),%xmm1,%xmm1 + + vaesenclast 192(%rdi),%xmm1,%xmm1 + jmp .Lexit_aes_wbuuzwjyGbjeaox +.align 32 +.Laes_256_wbuuzwjyGbjeaox: + vpxorq 0(%rdi),%xmm1,%xmm1 + + vaesenc 16(%rdi),%xmm1,%xmm1 + + vaesenc 32(%rdi),%xmm1,%xmm1 + + vaesenc 48(%rdi),%xmm1,%xmm1 + + vaesenc 64(%rdi),%xmm1,%xmm1 + + vaesenc 80(%rdi),%xmm1,%xmm1 + + vaesenc 96(%rdi),%xmm1,%xmm1 + + vaesenc 112(%rdi),%xmm1,%xmm1 + + vaesenc 128(%rdi),%xmm1,%xmm1 + + vaesenc 144(%rdi),%xmm1,%xmm1 + + vaesenc 160(%rdi),%xmm1,%xmm1 + + vaesenc 176(%rdi),%xmm1,%xmm1 + + vaesenc 192(%rdi),%xmm1,%xmm1 + + vaesenc 208(%rdi),%xmm1,%xmm1 + + vaesenclast 224(%rdi),%xmm1,%xmm1 + jmp .Lexit_aes_wbuuzwjyGbjeaox +.Lexit_aes_wbuuzwjyGbjeaox: + + vmovdqu %xmm1,32(%rsi) + + + vpshufb SHUF_MASK(%rip),%xmm2,%xmm2 + vmovdqu %xmm2,0(%rsi) + cmpq $256,%rcx + jbe .Lskip_hkeys_cleanup_pseltoyDnFwppqb + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 %zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_pseltoyDnFwppqb: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx +.Labort_setiv: + .byte 0xf3,0xc3 +.Lsetiv_seh_end: +.cfi_endproc +.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512 +.globl ossl_aes_gcm_update_aad_avx512 +.type ossl_aes_gcm_update_aad_avx512,@function +.align 32 +ossl_aes_gcm_update_aad_avx512: +.cfi_startproc +.Lghash_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Lghash_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Lghash_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Lghash_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Lghash_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Lghash_seh_push_r14: + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lghash_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Lghash_seh_setfp: + +.Lghash_seh_prolog_end: + subq $820,%rsp + andq $(-64),%rsp + vmovdqu64 64(%rdi),%xmm14 + movq %rsi,%r10 + movq %rdx,%r11 + orq %r11,%r11 + jz .L_CALC_AAD_done_ijFECAxDcrvrgja + + xorq %rbx,%rbx + vmovdqa64 SHUF_MASK(%rip),%zmm16 + +.L_get_AAD_loop48x16_ijFECAxDcrvrgja: + cmpq $768,%r11 + jl .L_exit_AAD_loop48x16_ijFECAxDcrvrgja + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_AfEjmfnrFdFcycC + + vmovdqu64 288(%rdi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rdi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rdi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rdi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq 
$0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,192(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,128(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,64(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,0(%rsp) +.L_skip_hkeys_precomputation_AfEjmfnrFdFcycC: + movq $1,%rbx + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 0(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 64(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 128(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 192(%rsp),%zmm19 + vpclmulqdq 
$0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 512(%r10),%zmm11 + vmovdqu64 576(%r10),%zmm3 + vmovdqu64 640(%r10),%zmm4 + vmovdqu64 704(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm6,%xmm9,%xmm14 + + subq $768,%r11 + je .L_CALC_AAD_done_ijFECAxDcrvrgja + 
+ addq $768,%r10 + jmp .L_get_AAD_loop48x16_ijFECAxDcrvrgja + +.L_exit_AAD_loop48x16_ijFECAxDcrvrgja: + + cmpq $512,%r11 + jl .L_less_than_32x16_ijFECAxDcrvrgja + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_kvsjACAeAekBEdd + + vmovdqu64 288(%rdi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rdi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rdi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rdi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) +.L_skip_hkeys_precomputation_kvsjACAeAekBEdd: + movq $1,%rbx + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + 
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm6,%xmm9,%xmm14 + + subq $512,%r11 + je .L_CALC_AAD_done_ijFECAxDcrvrgja + + addq $512,%r10 + jmp .L_less_than_16x16_ijFECAxDcrvrgja + +.L_less_than_32x16_ijFECAxDcrvrgja: + cmpq $256,%r11 + jl .L_less_than_16x16_ijFECAxDcrvrgja + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 96(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 160(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + 
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 224(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 288(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm6,%xmm9,%xmm14 + + subq $256,%r11 + je .L_CALC_AAD_done_ijFECAxDcrvrgja + + addq $256,%r10 + +.L_less_than_16x16_ijFECAxDcrvrgja: + + leaq byte64_len_to_mask_table(%rip),%r12 + leaq (%r12,%r11,8),%r12 + + + addl $15,%r11d + shrl $4,%r11d + cmpl $2,%r11d + jb .L_AAD_blocks_1_ijFECAxDcrvrgja + je .L_AAD_blocks_2_ijFECAxDcrvrgja + cmpl $4,%r11d + jb .L_AAD_blocks_3_ijFECAxDcrvrgja + je .L_AAD_blocks_4_ijFECAxDcrvrgja + cmpl $6,%r11d + jb .L_AAD_blocks_5_ijFECAxDcrvrgja + je .L_AAD_blocks_6_ijFECAxDcrvrgja + cmpl $8,%r11d + jb .L_AAD_blocks_7_ijFECAxDcrvrgja + je .L_AAD_blocks_8_ijFECAxDcrvrgja + cmpl $10,%r11d + jb .L_AAD_blocks_9_ijFECAxDcrvrgja + je .L_AAD_blocks_10_ijFECAxDcrvrgja + cmpl $12,%r11d + jb .L_AAD_blocks_11_ijFECAxDcrvrgja + je .L_AAD_blocks_12_ijFECAxDcrvrgja + cmpl $14,%r11d + jb .L_AAD_blocks_13_ijFECAxDcrvrgja + je .L_AAD_blocks_14_ijFECAxDcrvrgja + cmpl $15,%r11d + je .L_AAD_blocks_15_ijFECAxDcrvrgja +.L_AAD_blocks_16_ijFECAxDcrvrgja: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 96(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 160(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 224(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm9,%zmm11,%zmm1 + vpternlogq $0x96,%zmm10,%zmm3,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm12,%zmm11,%zmm7 + vpternlogq $0x96,%zmm13,%zmm3,%zmm8 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm5,%zmm13 + vpxorq 
%zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_15_ijFECAxDcrvrgja: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 112(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 176(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 240(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_14_ijFECAxDcrvrgja: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%ymm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %ymm16,%ymm5,%ymm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 128(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 192(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq 
$0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 256(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 320(%rdi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm5,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_13_ijFECAxDcrvrgja: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%xmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %xmm16,%xmm5,%xmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 144(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 208(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 272(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_12_ijFECAxDcrvrgja: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 
+ vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 160(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 224(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_11_ijFECAxDcrvrgja: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 176(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 240(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_10_ijFECAxDcrvrgja: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 
128(%r10),%ymm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %ymm16,%ymm4,%ymm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 192(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 256(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 320(%rdi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_9_ijFECAxDcrvrgja: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%xmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %xmm16,%xmm4,%xmm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 208(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 272(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_8_ijFECAxDcrvrgja: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + vpshufb 
%zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 224(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_7_ijFECAxDcrvrgja: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 240(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_6_ijFECAxDcrvrgja: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%ymm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %ymm16,%ymm3,%ymm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 256(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 320(%rdi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + 
vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_5_ijFECAxDcrvrgja: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%xmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %xmm16,%xmm3,%xmm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 272(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_4_ijFECAxDcrvrgja: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_3_ijFECAxDcrvrgja: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 
+ vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_2_ijFECAxDcrvrgja: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%ymm11{%k1}{z} + vpshufb %ymm16,%ymm11,%ymm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 320(%rdi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_ijFECAxDcrvrgja +.L_AAD_blocks_1_ijFECAxDcrvrgja: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%xmm11{%k1}{z} + vpshufb %xmm16,%xmm11,%xmm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + +.L_CALC_AAD_done_ijFECAxDcrvrgja: + vmovdqu64 %xmm14,64(%rdi) + cmpq $256,%rdx + jbe .Lskip_hkeys_cleanup_qbvewaDGpzpiiAA + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 %zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_qbvewaDGpzpiiAA: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx 
+.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx +.Lexit_update_aad: + .byte 0xf3,0xc3 +.Lghash_seh_end: +.cfi_endproc +.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512 +.globl ossl_aes_gcm_encrypt_avx512 +.type ossl_aes_gcm_encrypt_avx512,@function +.align 32 +ossl_aes_gcm_encrypt_avx512: +.cfi_startproc +.Lencrypt_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Lencrypt_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Lencrypt_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Lencrypt_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Lencrypt_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Lencrypt_seh_push_r14: + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lencrypt_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Lencrypt_seh_setfp: + +.Lencrypt_seh_prolog_end: + subq $1588,%rsp + andq $(-64),%rsp + + + movl 240(%rdi),%eax + cmpl $9,%eax + je .Laes_gcm_encrypt_128_avx512 + cmpl $11,%eax + je .Laes_gcm_encrypt_192_avx512 + cmpl $13,%eax + je .Laes_gcm_encrypt_256_avx512 + xorl %eax,%eax + jmp .Lexit_gcm_encrypt +.align 32 +.Laes_gcm_encrypt_128_avx512: + orq %r8,%r8 + je .L_enc_dec_done_pdDdEbGtmhbgzzj + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_pxhfCnBixjkllFd + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_pxhfCnBixjkllFd + subq %r13,%r12 +.L_no_extra_mask_pxhfCnBixjkllFd: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm3,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_pxhfCnBixjkllFd + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_pxhfCnBixjkllFd + +.L_partial_incomplete_pxhfCnBixjkllFd: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_pxhfCnBixjkllFd: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_pxhfCnBixjkllFd: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_pdDdEbGtmhbgzzj + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_pdDdEbGtmhbgzzj + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + 
andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_mapiDClopxEitar + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_mapiDClopxEitar +.L_next_16_overflow_mapiDClopxEitar: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_mapiDClopxEitar: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_wEgffnstFkkCiax + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_wEgffnstFkkCiax: + cmpq $512,%r8 + jb .L_message_below_32_blocks_pdDdEbGtmhbgzzj + + + + cmpb $240,%r15b + jae 
.L_next_16_overflow_lzgFuCogmBcsocA + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_lzgFuCogmBcsocA +.L_next_16_overflow_lzgFuCogmBcsocA: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_lzgFuCogmBcsocA: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_fxgusndxuFFGjih + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 
+ vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq 
%zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_fxgusndxuFFGjih: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_pdDdEbGtmhbgzzj +.L_encrypt_big_nblocks_pdDdEbGtmhbgzzj: + cmpb $240,%r15b + jae .L_16_blocks_overflow_ibqhltvwwkyjEta + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_ibqhltvwwkyjEta +.L_16_blocks_overflow_ibqhltvwwkyjEta: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_ibqhltvwwkyjEta: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 
96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_cEaavogFAbujiEy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_cEaavogFAbujiEy +.L_16_blocks_overflow_cEaavogFAbujiEy: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_cEaavogFAbujiEy: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 
64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_usjsvymwkviypdp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_usjsvymwkviypdp +.L_16_blocks_overflow_usjsvymwkviypdp: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_usjsvymwkviypdp: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 
%zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_pdDdEbGtmhbgzzj + +.L_no_more_big_nblocks_pdDdEbGtmhbgzzj: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_pdDdEbGtmhbgzzj + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_pdDdEbGtmhbgzzj +.L_encrypt_0_blocks_ghash_32_pdDdEbGtmhbgzzj: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_ikhdrkemcGbqzad + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_ikhdrkemcGbqzad + jb .L_last_num_blocks_is_7_1_ikhdrkemcGbqzad + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_ikhdrkemcGbqzad + jb .L_last_num_blocks_is_11_9_ikhdrkemcGbqzad + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_ikhdrkemcGbqzad + ja .L_last_num_blocks_is_16_ikhdrkemcGbqzad + cmpl $14,%r10d + je .L_last_num_blocks_is_14_ikhdrkemcGbqzad + jmp .L_last_num_blocks_is_13_ikhdrkemcGbqzad + +.L_last_num_blocks_is_11_9_ikhdrkemcGbqzad: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_ikhdrkemcGbqzad + ja .L_last_num_blocks_is_11_ikhdrkemcGbqzad + jmp .L_last_num_blocks_is_9_ikhdrkemcGbqzad + +.L_last_num_blocks_is_7_1_ikhdrkemcGbqzad: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_ikhdrkemcGbqzad + jb .L_last_num_blocks_is_3_1_ikhdrkemcGbqzad + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_ikhdrkemcGbqzad + je .L_last_num_blocks_is_6_ikhdrkemcGbqzad + jmp .L_last_num_blocks_is_5_ikhdrkemcGbqzad + +.L_last_num_blocks_is_3_1_ikhdrkemcGbqzad: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_ikhdrkemcGbqzad + je .L_last_num_blocks_is_2_ikhdrkemcGbqzad +.L_last_num_blocks_is_1_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_itDorffzaCkryqj + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_itDorffzaCkryqj + +.L_16_blocks_overflow_itDorffzaCkryqj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_itDorffzaCkryqj: + + + + + 
vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wcppwgxpbwxBCxm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wcppwgxpbwxBCxm +.L_small_initial_partial_block_wcppwgxpbwxBCxm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq 
%zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_wcppwgxpbwxBCxm +.L_small_initial_compute_done_wcppwgxpbwxBCxm: +.L_after_reduction_wcppwgxpbwxBCxm: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_2_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_udFwtdnCnceudlw + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_udFwtdnCnceudlw + +.L_16_blocks_overflow_udFwtdnCnceudlw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_udFwtdnCnceudlw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pBaBAiGArbidqBv + + + 
+ + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pBaBAiGArbidqBv +.L_small_initial_partial_block_pBaBAiGArbidqBv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pBaBAiGArbidqBv: + + orq %r8,%r8 + je .L_after_reduction_pBaBAiGArbidqBv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pBaBAiGArbidqBv: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_3_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_mnDuevixjjefvof + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_mnDuevixjjefvof + +.L_16_blocks_overflow_mnDuevixjjefvof: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_mnDuevixjjefvof: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 
+ vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yatvknGgscybvGg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yatvknGgscybvGg +.L_small_initial_partial_block_yatvknGgscybvGg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yatvknGgscybvGg: + + orq %r8,%r8 + je .L_after_reduction_yatvknGgscybvGg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_yatvknGgscybvGg: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_4_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_vsajDEszBaAzgFt + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_vsajDEszBaAzgFt + +.L_16_blocks_overflow_vsajDEszBaAzgFt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_vsajDEszBaAzgFt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tchAiplfgmzAeEo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tchAiplfgmzAeEo +.L_small_initial_partial_block_tchAiplfgmzAeEo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tchAiplfgmzAeEo: + + orq %r8,%r8 + je .L_after_reduction_tchAiplfgmzAeEo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tchAiplfgmzAeEo: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_5_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_cxtFqdnzBjmtkGn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_cxtFqdnzBjmtkGn + +.L_16_blocks_overflow_cxtFqdnzBjmtkGn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_cxtFqdnzBjmtkGn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EdeEenqDBtzbplp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EdeEenqDBtzbplp +.L_small_initial_partial_block_EdeEenqDBtzbplp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EdeEenqDBtzbplp: + + orq %r8,%r8 + je .L_after_reduction_EdeEenqDBtzbplp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_EdeEenqDBtzbplp: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_6_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_jwkFAEiBkzxclcz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_jwkFAEiBkzxclcz + +.L_16_blocks_overflow_jwkFAEiBkzxclcz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_jwkFAEiBkzxclcz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + 
vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lBhDyvvhkrxyrza + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lBhDyvvhkrxyrza +.L_small_initial_partial_block_lBhDyvvhkrxyrza: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lBhDyvvhkrxyrza: + + orq %r8,%r8 + je .L_after_reduction_lBhDyvvhkrxyrza + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lBhDyvvhkrxyrza: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_7_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_uGexndlCfdoqjpe + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_uGexndlCfdoqjpe + +.L_16_blocks_overflow_uGexndlCfdoqjpe: + vpshufb 
%zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_uGexndlCfdoqjpe: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_Bxunmhnvmncxhcy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + 
vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_Bxunmhnvmncxhcy +.L_small_initial_partial_block_Bxunmhnvmncxhcy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_Bxunmhnvmncxhcy: + + orq %r8,%r8 + je .L_after_reduction_Bxunmhnvmncxhcy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_Bxunmhnvmncxhcy: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_8_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_vudwsyfxfgECgcf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_vudwsyfxfgECgcf + +.L_16_blocks_overflow_vudwsyfxfgECgcf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_vudwsyfxfgECgcf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + 
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rvqyhsdrhoanuka + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rvqyhsdrhoanuka +.L_small_initial_partial_block_rvqyhsdrhoanuka: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rvqyhsdrhoanuka: + + orq %r8,%r8 + je .L_after_reduction_rvqyhsdrhoanuka + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rvqyhsdrhoanuka: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_9_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_mrBoGdbnxnwlkxC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_mrBoGdbnxnwlkxC + +.L_16_blocks_overflow_mrBoGdbnxnwlkxC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_mrBoGdbnxnwlkxC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq 
$0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tuyribkvmwGnBux + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tuyribkvmwGnBux +.L_small_initial_partial_block_tuyribkvmwGnBux: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tuyribkvmwGnBux: + + orq %r8,%r8 + je .L_after_reduction_tuyribkvmwGnBux + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tuyribkvmwGnBux: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_10_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_lgaFjCbzqlskvnC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_lgaFjCbzqlskvnC + +.L_16_blocks_overflow_lgaFjCbzqlskvnC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_lgaFjCbzqlskvnC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_doFvvyygahavAuD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_doFvvyygahavAuD +.L_small_initial_partial_block_doFvvyygahavAuD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 
272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_doFvvyygahavAuD: + + orq %r8,%r8 + je .L_after_reduction_doFvvyygahavAuD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_doFvvyygahavAuD: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_11_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_wnveeoCoFhnAsjr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_wnveeoCoFhnAsjr + +.L_16_blocks_overflow_wnveeoCoFhnAsjr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_wnveeoCoFhnAsjr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + 
vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_okdqxckEysfDiGw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_okdqxckEysfDiGw +.L_small_initial_partial_block_okdqxckEysfDiGw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + 
vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_okdqxckEysfDiGw: + + orq %r8,%r8 + je .L_after_reduction_okdqxckEysfDiGw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_okdqxckEysfDiGw: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_12_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_aeCekhphkkfCGlp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_aeCekhphkkfCGlp + +.L_16_blocks_overflow_aeCekhphkkfCGlp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_aeCekhphkkfCGlp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tAjudiknsDunngB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_tAjudiknsDunngB +.L_small_initial_partial_block_tAjudiknsDunngB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tAjudiknsDunngB: + + orq %r8,%r8 + je .L_after_reduction_tAjudiknsDunngB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tAjudiknsDunngB: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_13_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_vFhoejiyDCGCfdw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_vFhoejiyDCGCfdw + +.L_16_blocks_overflow_vFhoejiyDCGCfdw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_vFhoejiyDCGCfdw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_svrobwfwdbaDnCx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq 
$0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_svrobwfwdbaDnCx +.L_small_initial_partial_block_svrobwfwdbaDnCx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_svrobwfwdbaDnCx: + + orq %r8,%r8 + je .L_after_reduction_svrobwfwdbaDnCx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_svrobwfwdbaDnCx: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_14_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_hgwwfomjsnxunhr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_hgwwfomjsnxunhr + +.L_16_blocks_overflow_hgwwfomjsnxunhr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_hgwwfomjsnxunhr: + + + + + 
vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + 
vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_seAkuxixhdBEdfz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_seAkuxixhdBEdfz +.L_small_initial_partial_block_seAkuxixhdBEdfz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_seAkuxixhdBEdfz: + + orq %r8,%r8 + je .L_after_reduction_seAkuxixhdBEdfz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_seAkuxixhdBEdfz: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_15_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_wbagfdFdigxytjj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_wbagfdFdigxytjj + +.L_16_blocks_overflow_wbagfdFdigxytjj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_wbagfdFdigxytjj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq 
$0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ChmDFBmjkjBuetv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ChmDFBmjkjBuetv +.L_small_initial_partial_block_ChmDFBmjkjBuetv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ChmDFBmjkjBuetv: + + orq %r8,%r8 + je .L_after_reduction_ChmDFBmjkjBuetv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ChmDFBmjkjBuetv: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_16_ikhdrkemcGbqzad: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_dkuzxAGzynhzFCe + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_dkuzxAGzynhzFCe + +.L_16_blocks_overflow_dkuzxAGzynhzFCe: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_dkuzxAGzynhzFCe: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_vtbrvsizdbGzbGo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq 
$0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vtbrvsizdbGzbGo: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vtbrvsizdbGzbGo: + jmp .L_last_blocks_done_ikhdrkemcGbqzad +.L_last_num_blocks_is_0_ikhdrkemcGbqzad: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_ikhdrkemcGbqzad: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_pdDdEbGtmhbgzzj +.L_encrypt_32_blocks_pdDdEbGtmhbgzzj: + cmpb $240,%r15b + jae .L_16_blocks_overflow_DpBiAfvjdcateGm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_DpBiAfvjdcateGm +.L_16_blocks_overflow_DpBiAfvjdcateGm: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_DpBiAfvjdcateGm: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 
0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae 
.L_16_blocks_overflow_pnochsioawayaBr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_pnochsioawayaBr +.L_16_blocks_overflow_pnochsioawayaBr: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_pnochsioawayaBr: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 
+ vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_nqBvobwmcxocojb + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_nqBvobwmcxocojb + jb .L_last_num_blocks_is_7_1_nqBvobwmcxocojb + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_nqBvobwmcxocojb + jb .L_last_num_blocks_is_11_9_nqBvobwmcxocojb + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_nqBvobwmcxocojb + ja .L_last_num_blocks_is_16_nqBvobwmcxocojb + cmpl $14,%r10d + je .L_last_num_blocks_is_14_nqBvobwmcxocojb + jmp .L_last_num_blocks_is_13_nqBvobwmcxocojb + +.L_last_num_blocks_is_11_9_nqBvobwmcxocojb: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_nqBvobwmcxocojb + ja .L_last_num_blocks_is_11_nqBvobwmcxocojb + jmp .L_last_num_blocks_is_9_nqBvobwmcxocojb + +.L_last_num_blocks_is_7_1_nqBvobwmcxocojb: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_nqBvobwmcxocojb + jb .L_last_num_blocks_is_3_1_nqBvobwmcxocojb + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_nqBvobwmcxocojb + je 
.L_last_num_blocks_is_6_nqBvobwmcxocojb + jmp .L_last_num_blocks_is_5_nqBvobwmcxocojb + +.L_last_num_blocks_is_3_1_nqBvobwmcxocojb: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_nqBvobwmcxocojb + je .L_last_num_blocks_is_2_nqBvobwmcxocojb +.L_last_num_blocks_is_1_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_iGlCGEwegGzFhtA + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_iGlCGEwegGzFhtA + +.L_16_blocks_overflow_iGlCGEwegGzFhtA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_iGlCGEwegGzFhtA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hFBzlBjpABAteEq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hFBzlBjpABAteEq +.L_small_initial_partial_block_hFBzlBjpABAteEq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_hFBzlBjpABAteEq +.L_small_initial_compute_done_hFBzlBjpABAteEq: +.L_after_reduction_hFBzlBjpABAteEq: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_2_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_BwDxojfsymCmEeo + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_BwDxojfsymCmEeo + +.L_16_blocks_overflow_BwDxojfsymCmEeo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_BwDxojfsymCmEeo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq 
$0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ujnyckFGoBmGvAD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ujnyckFGoBmGvAD +.L_small_initial_partial_block_ujnyckFGoBmGvAD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ujnyckFGoBmGvAD: + + orq %r8,%r8 + je .L_after_reduction_ujnyckFGoBmGvAD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ujnyckFGoBmGvAD: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_3_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_ArGalqGfmEgtzdC + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_ArGalqGfmEgtzdC + +.L_16_blocks_overflow_ArGalqGfmEgtzdC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_ArGalqGfmEgtzdC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 
64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tlDwADlnmmFjwlt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tlDwADlnmmFjwlt +.L_small_initial_partial_block_tlDwADlnmmFjwlt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tlDwADlnmmFjwlt: + + orq %r8,%r8 + je .L_after_reduction_tlDwADlnmmFjwlt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tlDwADlnmmFjwlt: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_4_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_eiFwyntDmEqyCDx + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_eiFwyntDmEqyCDx + +.L_16_blocks_overflow_eiFwyntDmEqyCDx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_eiFwyntDmEqyCDx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq 
$16,%r8 + jl .L_small_initial_partial_block_zAosBwqfDyjcdyb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zAosBwqfDyjcdyb +.L_small_initial_partial_block_zAosBwqfDyjcdyb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zAosBwqfDyjcdyb: + + orq %r8,%r8 + je .L_after_reduction_zAosBwqfDyjcdyb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zAosBwqfDyjcdyb: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_5_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_bAoFucDcpblzDdt + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_bAoFucDcpblzDdt + +.L_16_blocks_overflow_bAoFucDcpblzDdt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_bAoFucDcpblzDdt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_icuaypakFrCovoy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_icuaypakFrCovoy +.L_small_initial_partial_block_icuaypakFrCovoy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + 
vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_icuaypakFrCovoy: + + orq %r8,%r8 + je .L_after_reduction_icuaypakFrCovoy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_icuaypakFrCovoy: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_6_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_nBxnDvEEtcfmmpA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_nBxnDvEEtcfmmpA + +.L_16_blocks_overflow_nBxnDvEEtcfmmpA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_nBxnDvEEtcfmmpA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oBDgqvmqflGBdts + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oBDgqvmqflGBdts +.L_small_initial_partial_block_oBDgqvmqflGBdts: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oBDgqvmqflGBdts: + + orq %r8,%r8 + 
je .L_after_reduction_oBDgqvmqflGBdts + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oBDgqvmqflGBdts: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_7_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_ktiEwgDjzbqnlgA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_ktiEwgDjzbqnlgA + +.L_16_blocks_overflow_ktiEwgDjzbqnlgA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_ktiEwgDjzbqnlgA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rhqzwAqatoAowvt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rhqzwAqatoAowvt +.L_small_initial_partial_block_rhqzwAqatoAowvt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rhqzwAqatoAowvt: + + orq %r8,%r8 + je .L_after_reduction_rhqzwAqatoAowvt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rhqzwAqatoAowvt: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_8_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_ppdpbjvaqFskcDy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_ppdpbjvaqFskcDy + +.L_16_blocks_overflow_ppdpbjvaqFskcDy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_ppdpbjvaqFskcDy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + 
vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hghryxmwctxcEsx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hghryxmwctxcEsx +.L_small_initial_partial_block_hghryxmwctxcEsx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hghryxmwctxcEsx: + + orq %r8,%r8 + je .L_after_reduction_hghryxmwctxcEsx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hghryxmwctxcEsx: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_9_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_ssqyutccxCiqEfp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_ssqyutccxCiqEfp + +.L_16_blocks_overflow_ssqyutccxCiqEfp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_ssqyutccxCiqEfp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + 
vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dkgcmoCccqwinCj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dkgcmoCccqwinCj +.L_small_initial_partial_block_dkgcmoCccqwinCj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dkgcmoCccqwinCj: + + orq %r8,%r8 + je .L_after_reduction_dkgcmoCccqwinCj + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dkgcmoCccqwinCj: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_10_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_qrrfwGAzztwabql + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_qrrfwGAzztwabql + +.L_16_blocks_overflow_qrrfwGAzztwabql: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_qrrfwGAzztwabql: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + 
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ioCDffAzuDvuFmD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ioCDffAzuDvuFmD +.L_small_initial_partial_block_ioCDffAzuDvuFmD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ioCDffAzuDvuFmD: + + orq %r8,%r8 + je .L_after_reduction_ioCDffAzuDvuFmD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ioCDffAzuDvuFmD: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_11_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_BFnbwbbsiwGDDCn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_BFnbwbbsiwGDDCn + +.L_16_blocks_overflow_BFnbwbbsiwGDDCn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_BFnbwbbsiwGDDCn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_cCoGeiFGozAwFew + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_cCoGeiFGozAwFew +.L_small_initial_partial_block_cCoGeiFGozAwFew: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_cCoGeiFGozAwFew: + + orq %r8,%r8 + je .L_after_reduction_cCoGeiFGozAwFew + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_cCoGeiFGozAwFew: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_12_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_haBiqFbjgxpdzpn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_haBiqFbjgxpdzpn + +.L_16_blocks_overflow_haBiqFbjgxpdzpn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_haBiqFbjgxpdzpn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + 
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_nhbrtEjyiFhswCq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_nhbrtEjyiFhswCq +.L_small_initial_partial_block_nhbrtEjyiFhswCq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_nhbrtEjyiFhswCq: + + orq %r8,%r8 + je .L_after_reduction_nhbrtEjyiFhswCq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_nhbrtEjyiFhswCq: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_13_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_aDaGBFBAaojGGGj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_aDaGBFBAaojGGGj + +.L_16_blocks_overflow_aDaGBFBAaojGGGj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_aDaGBFBAaojGGGj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + 
vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mozkzBtivrcvtEk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + 
vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mozkzBtivrcvtEk +.L_small_initial_partial_block_mozkzBtivrcvtEk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mozkzBtivrcvtEk: + + orq %r8,%r8 + je .L_after_reduction_mozkzBtivrcvtEk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_mozkzBtivrcvtEk: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_14_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_tAnEojledvrxyjr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_tAnEojledvrxyjr + +.L_16_blocks_overflow_tAnEojledvrxyjr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_tAnEojledvrxyjr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FdkjoDukspwasBA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FdkjoDukspwasBA +.L_small_initial_partial_block_FdkjoDukspwasBA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq 
%xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FdkjoDukspwasBA: + + orq %r8,%r8 + je .L_after_reduction_FdkjoDukspwasBA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FdkjoDukspwasBA: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_15_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_EocAcwAEiGzmbor + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_EocAcwAEiGzmbor + +.L_16_blocks_overflow_EocAcwAEiGzmbor: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_EocAcwAEiGzmbor: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + 
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ioeijxfuGydnlim + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ioeijxfuGydnlim +.L_small_initial_partial_block_ioeijxfuGydnlim: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ioeijxfuGydnlim: + + orq %r8,%r8 + je .L_after_reduction_ioeijxfuGydnlim + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ioeijxfuGydnlim: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_16_nqBvobwmcxocojb: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_uDqoqnyAqaujFth + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_uDqoqnyAqaujFth + +.L_16_blocks_overflow_uDqoqnyAqaujFth: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_uDqoqnyAqaujFth: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_rpjttlmmCtxqtrD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq 
$0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rpjttlmmCtxqtrD: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rpjttlmmCtxqtrD: + jmp .L_last_blocks_done_nqBvobwmcxocojb +.L_last_num_blocks_is_0_nqBvobwmcxocojb: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_nqBvobwmcxocojb: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_pdDdEbGtmhbgzzj +.L_encrypt_16_blocks_pdDdEbGtmhbgzzj: + cmpb $240,%r15b + jae .L_16_blocks_overflow_mlfnqsfcdbpAAfz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_mlfnqsfcdbpAAfz +.L_16_blocks_overflow_mlfnqsfcdbpAAfz: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb 
%zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_mlfnqsfcdbpAAfz: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb 
%zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_hommwsmBDghhsCD + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_hommwsmBDghhsCD + jb .L_last_num_blocks_is_7_1_hommwsmBDghhsCD + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_hommwsmBDghhsCD + jb .L_last_num_blocks_is_11_9_hommwsmBDghhsCD + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_hommwsmBDghhsCD + ja .L_last_num_blocks_is_16_hommwsmBDghhsCD + cmpl $14,%r10d + je .L_last_num_blocks_is_14_hommwsmBDghhsCD + jmp .L_last_num_blocks_is_13_hommwsmBDghhsCD + +.L_last_num_blocks_is_11_9_hommwsmBDghhsCD: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_hommwsmBDghhsCD + ja .L_last_num_blocks_is_11_hommwsmBDghhsCD + jmp .L_last_num_blocks_is_9_hommwsmBDghhsCD + +.L_last_num_blocks_is_7_1_hommwsmBDghhsCD: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_hommwsmBDghhsCD + jb .L_last_num_blocks_is_3_1_hommwsmBDghhsCD + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_hommwsmBDghhsCD + je .L_last_num_blocks_is_6_hommwsmBDghhsCD + jmp .L_last_num_blocks_is_5_hommwsmBDghhsCD + +.L_last_num_blocks_is_3_1_hommwsmBDghhsCD: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_hommwsmBDghhsCD + je .L_last_num_blocks_is_2_hommwsmBDghhsCD +.L_last_num_blocks_is_1_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_mgEtuxommfhprEy + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_mgEtuxommfhprEy + +.L_16_blocks_overflow_mgEtuxommfhprEy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_mgEtuxommfhprEy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc 
%xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hmAEtdvbxtuofqt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hmAEtdvbxtuofqt +.L_small_initial_partial_block_hmAEtdvbxtuofqt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp 
.L_after_reduction_hmAEtdvbxtuofqt +.L_small_initial_compute_done_hmAEtdvbxtuofqt: +.L_after_reduction_hmAEtdvbxtuofqt: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_2_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_eunligEgprqxzEB + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_eunligEgprqxzEB + +.L_16_blocks_overflow_eunligEgprqxzEB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_eunligEgprqxzEB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_CpCtmyiCpxeyqBF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CpCtmyiCpxeyqBF +.L_small_initial_partial_block_CpCtmyiCpxeyqBF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CpCtmyiCpxeyqBF: + + orq %r8,%r8 + je .L_after_reduction_CpCtmyiCpxeyqBF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CpCtmyiCpxeyqBF: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_3_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_tCygkraciCitCxE + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_tCygkraciCitCxE + +.L_16_blocks_overflow_tCygkraciCitCxE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_tCygkraciCitCxE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oscyleCtgoefssq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oscyleCtgoefssq +.L_small_initial_partial_block_oscyleCtgoefssq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + 
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oscyleCtgoefssq: + + orq %r8,%r8 + je .L_after_reduction_oscyleCtgoefssq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oscyleCtgoefssq: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_4_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_nkuGqpqvsuAfkpy + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_nkuGqpqvsuAfkpy + +.L_16_blocks_overflow_nkuGqpqvsuAfkpy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_nkuGqpqvsuAfkpy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq 
$0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bszjeCzlpihayrq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bszjeCzlpihayrq +.L_small_initial_partial_block_bszjeCzlpihayrq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bszjeCzlpihayrq: + + orq %r8,%r8 + je .L_after_reduction_bszjeCzlpihayrq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bszjeCzlpihayrq: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_5_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_yBohCFkvcahhcEE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_yBohCFkvcahhcEE + +.L_16_blocks_overflow_yBohCFkvcahhcEE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_yBohCFkvcahhcEE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_peyrCumyCvjyexD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq 
$0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_peyrCumyCvjyexD +.L_small_initial_partial_block_peyrCumyCvjyexD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_peyrCumyCvjyexD: + + orq %r8,%r8 + je .L_after_reduction_peyrCumyCvjyexD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_peyrCumyCvjyexD: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_6_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_efCkGsdFqsctEDl + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_efCkGsdFqsctEDl + +.L_16_blocks_overflow_efCkGsdFqsctEDl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_efCkGsdFqsctEDl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_nolBDipDBhtrDmb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 
+ vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_nolBDipDBhtrDmb +.L_small_initial_partial_block_nolBDipDBhtrDmb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_nolBDipDBhtrDmb: + + orq %r8,%r8 + je .L_after_reduction_nolBDipDBhtrDmb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_nolBDipDBhtrDmb: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_7_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_uGpnccromgjsdor + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_uGpnccromgjsdor + +.L_16_blocks_overflow_uGpnccromgjsdor: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_uGpnccromgjsdor: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wFFpDbecxxomBhl + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_wFFpDbecxxomBhl +.L_small_initial_partial_block_wFFpDbecxxomBhl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wFFpDbecxxomBhl: + + orq %r8,%r8 + je .L_after_reduction_wFFpDbecxxomBhl + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wFFpDbecxxomBhl: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_8_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_rCxvxGCqotFabFi + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_rCxvxGCqotFabFi + +.L_16_blocks_overflow_rCxvxGCqotFabFi: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_rCxvxGCqotFabFi: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + 
vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GfamjmilndFvzhv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GfamjmilndFvzhv +.L_small_initial_partial_block_GfamjmilndFvzhv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GfamjmilndFvzhv: + + orq %r8,%r8 + je .L_after_reduction_GfamjmilndFvzhv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_GfamjmilndFvzhv: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_9_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_ycGahwjqkughsCy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_ycGahwjqkughsCy + +.L_16_blocks_overflow_ycGahwjqkughsCy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_ycGahwjqkughsCy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 
64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oodBdsqrimpGlcx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + 
+ + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oodBdsqrimpGlcx +.L_small_initial_partial_block_oodBdsqrimpGlcx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oodBdsqrimpGlcx: + + orq %r8,%r8 + je .L_after_reduction_oodBdsqrimpGlcx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oodBdsqrimpGlcx: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_10_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_qvAdocAzEtlnyGa + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_qvAdocAzEtlnyGa + +.L_16_blocks_overflow_qvAdocAzEtlnyGa: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_qvAdocAzEtlnyGa: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + 
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sDpafzbwGCbyCCy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_sDpafzbwGCbyCCy +.L_small_initial_partial_block_sDpafzbwGCbyCCy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_sDpafzbwGCbyCCy: + + orq %r8,%r8 + je .L_after_reduction_sDpafzbwGCbyCCy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_sDpafzbwGCbyCCy: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_11_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_BGwcgjgblbFBkyn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_BGwcgjgblbFBkyn + +.L_16_blocks_overflow_BGwcgjgblbFBkyn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_BGwcgjgblbFBkyn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oDmcaDazcjvlCqo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + 
vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oDmcaDazcjvlCqo +.L_small_initial_partial_block_oDmcaDazcjvlCqo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oDmcaDazcjvlCqo: + + orq %r8,%r8 + je .L_after_reduction_oDmcaDazcjvlCqo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oDmcaDazcjvlCqo: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_12_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_ooGtexyxfikBFDA + vpaddd 
%zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_ooGtexyxfikBFDA + +.L_16_blocks_overflow_ooGtexyxfikBFDA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_ooGtexyxfikBFDA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + 
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hawFrugxuDsFkwh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hawFrugxuDsFkwh +.L_small_initial_partial_block_hawFrugxuDsFkwh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hawFrugxuDsFkwh: + + orq %r8,%r8 + je .L_after_reduction_hawFrugxuDsFkwh + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hawFrugxuDsFkwh: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_13_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_ffjezAuFCnhGagx + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_ffjezAuFCnhGagx + +.L_16_blocks_overflow_ffjezAuFCnhGagx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_ffjezAuFCnhGagx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_nszsngmcgAavfgo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_nszsngmcgAavfgo +.L_small_initial_partial_block_nszsngmcgAavfgo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_nszsngmcgAavfgo: + + orq %r8,%r8 + je .L_after_reduction_nszsngmcgAavfgo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_nszsngmcgAavfgo: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_14_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_CfdCFDnjwhDDuze + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_CfdCFDnjwhDDuze + +.L_16_blocks_overflow_CfdCFDnjwhDDuze: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_CfdCFDnjwhDDuze: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_nnhzacbBeBgBwss + + + + + + subq 
$16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_nnhzacbBeBgBwss +.L_small_initial_partial_block_nnhzacbBeBgBwss: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_nnhzacbBeBgBwss: + + orq %r8,%r8 + je .L_after_reduction_nnhzacbBeBgBwss + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_nnhzacbBeBgBwss: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_15_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_GAcGndzbDEvCwfz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_GAcGndzbDEvCwfz + +.L_16_blocks_overflow_GAcGndzbDEvCwfz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_GAcGndzbDEvCwfz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kpsoetidpdjlnwh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_kpsoetidpdjlnwh +.L_small_initial_partial_block_kpsoetidpdjlnwh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kpsoetidpdjlnwh: + + orq %r8,%r8 + je .L_after_reduction_kpsoetidpdjlnwh + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_kpsoetidpdjlnwh: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_16_hommwsmBDghhsCD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_wpowiymzckfpmlc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_wpowiymzckfpmlc + +.L_16_blocks_overflow_wpowiymzckfpmlc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_wpowiymzckfpmlc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 
48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_xjewDEdrojAwizl: + + + + 
+ + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xjewDEdrojAwizl: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xjewDEdrojAwizl: + jmp .L_last_blocks_done_hommwsmBDghhsCD +.L_last_num_blocks_is_0_hommwsmBDghhsCD: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + 
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_hommwsmBDghhsCD: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_pdDdEbGtmhbgzzj + +.L_message_below_32_blocks_pdDdEbGtmhbgzzj: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_zxFmdGhwegjCAGr + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_zxFmdGhwegjCAGr: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_yEtjCjlkazyuxae + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_yEtjCjlkazyuxae + jb .L_last_num_blocks_is_7_1_yEtjCjlkazyuxae + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_yEtjCjlkazyuxae + jb 
.L_last_num_blocks_is_11_9_yEtjCjlkazyuxae + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_yEtjCjlkazyuxae + ja .L_last_num_blocks_is_16_yEtjCjlkazyuxae + cmpl $14,%r10d + je .L_last_num_blocks_is_14_yEtjCjlkazyuxae + jmp .L_last_num_blocks_is_13_yEtjCjlkazyuxae + +.L_last_num_blocks_is_11_9_yEtjCjlkazyuxae: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_yEtjCjlkazyuxae + ja .L_last_num_blocks_is_11_yEtjCjlkazyuxae + jmp .L_last_num_blocks_is_9_yEtjCjlkazyuxae + +.L_last_num_blocks_is_7_1_yEtjCjlkazyuxae: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_yEtjCjlkazyuxae + jb .L_last_num_blocks_is_3_1_yEtjCjlkazyuxae + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_yEtjCjlkazyuxae + je .L_last_num_blocks_is_6_yEtjCjlkazyuxae + jmp .L_last_num_blocks_is_5_yEtjCjlkazyuxae + +.L_last_num_blocks_is_3_1_yEtjCjlkazyuxae: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_yEtjCjlkazyuxae + je .L_last_num_blocks_is_2_yEtjCjlkazyuxae +.L_last_num_blocks_is_1_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_GemCxiwxneizpok + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_GemCxiwxneizpok + +.L_16_blocks_overflow_GemCxiwxneizpok: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_GemCxiwxneizpok: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + 
vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lDxtxBkDCvCDeAu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lDxtxBkDCvCDeAu +.L_small_initial_partial_block_lDxtxBkDCvCDeAu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_lDxtxBkDCvCDeAu +.L_small_initial_compute_done_lDxtxBkDCvCDeAu: +.L_after_reduction_lDxtxBkDCvCDeAu: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_2_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_mtbzanedDzblhBt + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_mtbzanedDzblhBt + +.L_16_blocks_overflow_mtbzanedDzblhBt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_mtbzanedDzblhBt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vDfEzdpCaoutqpk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vDfEzdpCaoutqpk +.L_small_initial_partial_block_vDfEzdpCaoutqpk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vDfEzdpCaoutqpk: + + orq %r8,%r8 + je .L_after_reduction_vDfEzdpCaoutqpk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vDfEzdpCaoutqpk: + 
jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_3_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_dEDrjDhcyydvacb + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_dEDrjDhcyydvacb + +.L_16_blocks_overflow_dEDrjDhcyydvacb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_dEDrjDhcyydvacb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ulcxboFccGvxqoA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ulcxboFccGvxqoA +.L_small_initial_partial_block_ulcxboFccGvxqoA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ulcxboFccGvxqoA: + + orq %r8,%r8 + je .L_after_reduction_ulcxboFccGvxqoA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ulcxboFccGvxqoA: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_4_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_oDxtFmsewqDacsh + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_oDxtFmsewqDacsh + +.L_16_blocks_overflow_oDxtFmsewqDacsh: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_oDxtFmsewqDacsh: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vugvwEfszCpbGFf + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vugvwEfszCpbGFf +.L_small_initial_partial_block_vugvwEfszCpbGFf: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vugvwEfszCpbGFf: + + orq %r8,%r8 + je .L_after_reduction_vugvwEfszCpbGFf + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vugvwEfszCpbGFf: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_5_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_xkcGkGACdgyhfnk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_xkcGkGACdgyhfnk + +.L_16_blocks_overflow_xkcGkGACdgyhfnk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd 
%zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_xkcGkGACdgyhfnk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ztfihBbCfBvyfov + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + 
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ztfihBbCfBvyfov +.L_small_initial_partial_block_ztfihBbCfBvyfov: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ztfihBbCfBvyfov: + + orq %r8,%r8 + je .L_after_reduction_ztfihBbCfBvyfov + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ztfihBbCfBvyfov: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_6_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_xlFpBxEfzmCmemF + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_xlFpBxEfzmCmemF + +.L_16_blocks_overflow_xlFpBxEfzmCmemF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_xlFpBxEfzmCmemF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lxGrFedjGdoqthf + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lxGrFedjGdoqthf +.L_small_initial_partial_block_lxGrFedjGdoqthf: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + 
vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lxGrFedjGdoqthf: + + orq %r8,%r8 + je .L_after_reduction_lxGrFedjGdoqthf + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lxGrFedjGdoqthf: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_7_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_epvGyiwrthhFeDk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_epvGyiwrthhFeDk + +.L_16_blocks_overflow_epvGyiwrthhFeDk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_epvGyiwrthhFeDk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 
144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lDmxfclvwFuFuGn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lDmxfclvwFuFuGn +.L_small_initial_partial_block_lDmxfclvwFuFuGn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lDmxfclvwFuFuGn: + + orq %r8,%r8 + je .L_after_reduction_lDmxfclvwFuFuGn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lDmxfclvwFuFuGn: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_8_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq 
$64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_rlpnCjhhrhBjnBv + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_rlpnCjhhrhBjnBv + +.L_16_blocks_overflow_rlpnCjhhrhBjnBv: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_rlpnCjhhrhBjnBv: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wCmlnxlmuAqfmku + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wCmlnxlmuAqfmku +.L_small_initial_partial_block_wCmlnxlmuAqfmku: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wCmlnxlmuAqfmku: + + orq %r8,%r8 + je .L_after_reduction_wCmlnxlmuAqfmku + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wCmlnxlmuAqfmku: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_9_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_xGcqvoGCBlCvFjF + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_xGcqvoGCBlCvFjF + +.L_16_blocks_overflow_xGcqvoGCBlCvFjF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_xGcqvoGCBlCvFjF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + 
vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uoAmEEFbAhessra + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uoAmEEFbAhessra +.L_small_initial_partial_block_uoAmEEFbAhessra: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uoAmEEFbAhessra: + + orq %r8,%r8 + je .L_after_reduction_uoAmEEFbAhessra + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uoAmEEFbAhessra: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_10_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_lxwlEahBzykFvop + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_lxwlEahBzykFvop + +.L_16_blocks_overflow_lxwlEahBzykFvop: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_lxwlEahBzykFvop: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + 
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ymGqwwcaDlhrzht + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq 
$0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ymGqwwcaDlhrzht +.L_small_initial_partial_block_ymGqwwcaDlhrzht: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ymGqwwcaDlhrzht: + + orq %r8,%r8 + je .L_after_reduction_ymGqwwcaDlhrzht + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ymGqwwcaDlhrzht: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_11_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_DwphDuBmGjsjgos + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_DwphDuBmGjsjgos + +.L_16_blocks_overflow_DwphDuBmGjsjgos: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_DwphDuBmGjsjgos: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 
+ vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_feadFtsqxgxipCv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + 
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_feadFtsqxgxipCv +.L_small_initial_partial_block_feadFtsqxgxipCv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_feadFtsqxgxipCv: + + orq %r8,%r8 + je .L_after_reduction_feadFtsqxgxipCv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_feadFtsqxgxipCv: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_12_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_tysgGmlzxDCuchk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_tysgGmlzxDCuchk + +.L_16_blocks_overflow_tysgGmlzxDCuchk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_tysgGmlzxDCuchk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jdvGApyCGfzBhpb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jdvGApyCGfzBhpb +.L_small_initial_partial_block_jdvGApyCGfzBhpb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jdvGApyCGfzBhpb: + + orq %r8,%r8 + je .L_after_reduction_jdvGApyCGfzBhpb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jdvGApyCGfzBhpb: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_13_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_halbrdjstkvuogl + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_halbrdjstkvuogl + +.L_16_blocks_overflow_halbrdjstkvuogl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb 
%xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_halbrdjstkvuogl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + 
vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pdxowiCmkqsedqs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pdxowiCmkqsedqs +.L_small_initial_partial_block_pdxowiCmkqsedqs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_pdxowiCmkqsedqs: + + orq %r8,%r8 + je .L_after_reduction_pdxowiCmkqsedqs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pdxowiCmkqsedqs: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_14_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_qlykidCbnDmCaom + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_qlykidCbnDmCaom + +.L_16_blocks_overflow_qlykidCbnDmCaom: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_qlykidCbnDmCaom: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bCGuxGwffFmkxlq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bCGuxGwffFmkxlq +.L_small_initial_partial_block_bCGuxGwffFmkxlq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + 
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bCGuxGwffFmkxlq: + + orq %r8,%r8 + je .L_after_reduction_bCGuxGwffFmkxlq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bCGuxGwffFmkxlq: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_15_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_tvonowlqiEmbpqm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_tvonowlqiEmbpqm + +.L_16_blocks_overflow_tvonowlqiEmbpqm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_tvonowlqiEmbpqm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq 
$0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dlvvxnvpiqivacr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dlvvxnvpiqivacr +.L_small_initial_partial_block_dlvvxnvpiqivacr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dlvvxnvpiqivacr: + + orq %r8,%r8 + je .L_after_reduction_dlvvxnvpiqivacr + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dlvvxnvpiqivacr: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_16_yEtjCjlkazyuxae: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_nqzepvdnfxxrztt + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_nqzepvdnfxxrztt + +.L_16_blocks_overflow_nqzepvdnfxxrztt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_nqzepvdnfxxrztt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq 
%zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_bBybkCcjjhhjGnD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 
176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bBybkCcjjhhjGnD: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bBybkCcjjhhjGnD: + jmp .L_last_blocks_done_yEtjCjlkazyuxae +.L_last_num_blocks_is_0_yEtjCjlkazyuxae: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + 
vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_yEtjCjlkazyuxae: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_pdDdEbGtmhbgzzj + +.L_message_below_equal_16_blocks_pdDdEbGtmhbgzzj: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_ewuGsEvelaCkirh + jl .L_small_initial_num_blocks_is_7_1_ewuGsEvelaCkirh + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_ewuGsEvelaCkirh + jl .L_small_initial_num_blocks_is_11_9_ewuGsEvelaCkirh + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_ewuGsEvelaCkirh + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_ewuGsEvelaCkirh + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_ewuGsEvelaCkirh + jmp .L_small_initial_num_blocks_is_13_ewuGsEvelaCkirh + +.L_small_initial_num_blocks_is_11_9_ewuGsEvelaCkirh: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_ewuGsEvelaCkirh + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_ewuGsEvelaCkirh + jmp .L_small_initial_num_blocks_is_9_ewuGsEvelaCkirh + +.L_small_initial_num_blocks_is_7_1_ewuGsEvelaCkirh: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_ewuGsEvelaCkirh + jl .L_small_initial_num_blocks_is_3_1_ewuGsEvelaCkirh + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_ewuGsEvelaCkirh + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_ewuGsEvelaCkirh + jmp .L_small_initial_num_blocks_is_5_ewuGsEvelaCkirh + +.L_small_initial_num_blocks_is_3_1_ewuGsEvelaCkirh: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_ewuGsEvelaCkirh + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_ewuGsEvelaCkirh + + + + + +.L_small_initial_num_blocks_is_1_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qAfhfumcaDjruco + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + 
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qAfhfumcaDjruco +.L_small_initial_partial_block_qAfhfumcaDjruco: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_qAfhfumcaDjruco +.L_small_initial_compute_done_qAfhfumcaDjruco: +.L_after_reduction_qAfhfumcaDjruco: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_2_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ubuBFaxsGrnemfF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ubuBFaxsGrnemfF +.L_small_initial_partial_block_ubuBFaxsGrnemfF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq 
%xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ubuBFaxsGrnemfF: + + orq %r8,%r8 + je .L_after_reduction_ubuBFaxsGrnemfF + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ubuBFaxsGrnemfF: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_3_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ndaAlsscEjpEkoq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ndaAlsscEjpEkoq +.L_small_initial_partial_block_ndaAlsscEjpEkoq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ndaAlsscEjpEkoq: + + orq %r8,%r8 + je .L_after_reduction_ndaAlsscEjpEkoq + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ndaAlsscEjpEkoq: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_4_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jktiGoAbGDiFkaq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jktiGoAbGDiFkaq +.L_small_initial_partial_block_jktiGoAbGDiFkaq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jktiGoAbGDiFkaq: + + orq %r8,%r8 + je .L_after_reduction_jktiGoAbGDiFkaq + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_jktiGoAbGDiFkaq: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_5_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %xmm29,%xmm3,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sEqEFsxphmltbmr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + 
vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_sEqEFsxphmltbmr +.L_small_initial_partial_block_sEqEFsxphmltbmr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_sEqEFsxphmltbmr: + + orq %r8,%r8 + je .L_after_reduction_sEqEFsxphmltbmr + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_sEqEFsxphmltbmr: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_6_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %ymm29,%ymm3,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_slpocbFrpsoiAib + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + 
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_slpocbFrpsoiAib +.L_small_initial_partial_block_slpocbFrpsoiAib: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_slpocbFrpsoiAib: + + orq %r8,%r8 + je .L_after_reduction_slpocbFrpsoiAib + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_slpocbFrpsoiAib: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_7_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 
64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EEknGefGCzrkolw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EEknGefGCzrkolw +.L_small_initial_partial_block_EEknGefGCzrkolw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EEknGefGCzrkolw: + + orq %r8,%r8 + je .L_after_reduction_EEknGefGCzrkolw + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_EEknGefGCzrkolw: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_8_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qrgmfxpdazygeCe + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qrgmfxpdazygeCe +.L_small_initial_partial_block_qrgmfxpdazygeCe: + + + + + + + + + movq 
%r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qrgmfxpdazygeCe: + + orq %r8,%r8 + je .L_after_reduction_qrgmfxpdazygeCe + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_qrgmfxpdazygeCe: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_9_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %xmm29,%xmm4,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ixdohjdwtejkAah + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ixdohjdwtejkAah +.L_small_initial_partial_block_ixdohjdwtejkAah: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ixdohjdwtejkAah: + + orq %r8,%r8 + je .L_after_reduction_ixdohjdwtejkAah + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ixdohjdwtejkAah: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_10_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %ymm29,%ymm4,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kdvEyrakCtlldFt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 
+ + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_kdvEyrakCtlldFt +.L_small_initial_partial_block_kdvEyrakCtlldFt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kdvEyrakCtlldFt: + + orq %r8,%r8 + je .L_after_reduction_kdvEyrakCtlldFt + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_kdvEyrakCtlldFt: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_11_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + 
vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_cutxzwGkeBggDqx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_cutxzwGkeBggDqx +.L_small_initial_partial_block_cutxzwGkeBggDqx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + 
vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_cutxzwGkeBggDqx: + + orq %r8,%r8 + je .L_after_reduction_cutxzwGkeBggDqx + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_cutxzwGkeBggDqx: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_12_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oqFnyhhlpeztanE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + 
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oqFnyhhlpeztanE +.L_small_initial_partial_block_oqFnyhhlpeztanE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oqFnyhhlpeztanE: + + orq %r8,%r8 + je .L_after_reduction_oqFnyhhlpeztanE + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_oqFnyhhlpeztanE: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_13_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb 
%zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %xmm29,%xmm5,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mloEfjmpzzECCFk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mloEfjmpzzECCFk +.L_small_initial_partial_block_mloEfjmpzzECCFk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mloEfjmpzzECCFk: + + orq %r8,%r8 + je .L_after_reduction_mloEfjmpzzECCFk + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_mloEfjmpzzECCFk: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_14_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + 
vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %ymm29,%ymm5,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lokFbqCpdpswyxF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lokFbqCpdpswyxF +.L_small_initial_partial_block_lokFbqCpdpswyxF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lokFbqCpdpswyxF: + + orq %r8,%r8 + je .L_after_reduction_lokFbqCpdpswyxF + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_lokFbqCpdpswyxF: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_15_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + 
vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bmnsCorxdnheyAb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bmnsCorxdnheyAb +.L_small_initial_partial_block_bmnsCorxdnheyAb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq 
$0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bmnsCorxdnheyAb: + + orq %r8,%r8 + je .L_after_reduction_bmnsCorxdnheyAb + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_bmnsCorxdnheyAb: + jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh +.L_small_initial_num_blocks_is_16_ewuGsEvelaCkirh: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + 
vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_iGnlhalqoGhdkbv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_iGnlhalqoGhdkbv: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_iGnlhalqoGhdkbv: +.L_small_initial_blocks_encrypted_ewuGsEvelaCkirh: +.L_ghash_done_pdDdEbGtmhbgzzj: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_pdDdEbGtmhbgzzj: + jmp .Lexit_gcm_encrypt +.align 32 +.Laes_gcm_encrypt_192_avx512: + orq %r8,%r8 + je .L_enc_dec_done_tFbkipsuzBAeEGF + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_jdCiCmGpmghGfDo + movl $16,%r10d + leaq 
byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_jdCiCmGpmghGfDo + subq %r13,%r12 +.L_no_extra_mask_jdCiCmGpmghGfDo: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm3,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_jdCiCmGpmghGfDo + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_jdCiCmGpmghGfDo + +.L_partial_incomplete_jdCiCmGpmghGfDo: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_jdCiCmGpmghGfDo: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_jdCiCmGpmghGfDo: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_tFbkipsuzBAeEGF + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_tFbkipsuzBAeEGF + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_pFvraahbaffuyct + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_pFvraahbaffuyct +.L_next_16_overflow_pFvraahbaffuyct: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_pFvraahbaffuyct: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 
64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_yenzjhtagtpjklu + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_yenzjhtagtpjklu: + cmpq $512,%r8 + jb .L_message_below_32_blocks_tFbkipsuzBAeEGF + + + + cmpb $240,%r15b + jae .L_next_16_overflow_enCpGzovkqzhwzc + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_enCpGzovkqzhwzc +.L_next_16_overflow_enCpGzovkqzhwzc: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_enCpGzovkqzhwzc: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + 
vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_jqGvtcbttbiaDxy + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + 
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq 
$0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_jqGvtcbttbiaDxy: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_tFbkipsuzBAeEGF +.L_encrypt_big_nblocks_tFbkipsuzBAeEGF: + cmpb $240,%r15b + jae .L_16_blocks_overflow_jddBEjFhbsBAmmE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_jddBEjFhbsBAmmE +.L_16_blocks_overflow_jddBEjFhbsBAmmE: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_jddBEjFhbsBAmmE: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_idpAqFqszdhymlh + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_idpAqFqszdhymlh +.L_16_blocks_overflow_idpAqFqszdhymlh: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_idpAqFqszdhymlh: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + 
vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_EFGAxoobnnGywoA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_EFGAxoobnnGywoA +.L_16_blocks_overflow_EFGAxoobnnGywoA: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_EFGAxoobnnGywoA: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 
%zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_tFbkipsuzBAeEGF + +.L_no_more_big_nblocks_tFbkipsuzBAeEGF: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_tFbkipsuzBAeEGF + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_tFbkipsuzBAeEGF +.L_encrypt_0_blocks_ghash_32_tFbkipsuzBAeEGF: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_uFjiwCxmGEbfAFa + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_uFjiwCxmGEbfAFa + jb .L_last_num_blocks_is_7_1_uFjiwCxmGEbfAFa + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_uFjiwCxmGEbfAFa + jb .L_last_num_blocks_is_11_9_uFjiwCxmGEbfAFa + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_uFjiwCxmGEbfAFa + ja .L_last_num_blocks_is_16_uFjiwCxmGEbfAFa + cmpl $14,%r10d + je .L_last_num_blocks_is_14_uFjiwCxmGEbfAFa + jmp .L_last_num_blocks_is_13_uFjiwCxmGEbfAFa + +.L_last_num_blocks_is_11_9_uFjiwCxmGEbfAFa: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_uFjiwCxmGEbfAFa + ja .L_last_num_blocks_is_11_uFjiwCxmGEbfAFa + jmp .L_last_num_blocks_is_9_uFjiwCxmGEbfAFa + +.L_last_num_blocks_is_7_1_uFjiwCxmGEbfAFa: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_uFjiwCxmGEbfAFa + jb .L_last_num_blocks_is_3_1_uFjiwCxmGEbfAFa + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_uFjiwCxmGEbfAFa + je .L_last_num_blocks_is_6_uFjiwCxmGEbfAFa + jmp .L_last_num_blocks_is_5_uFjiwCxmGEbfAFa + +.L_last_num_blocks_is_3_1_uFjiwCxmGEbfAFa: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_uFjiwCxmGEbfAFa + je .L_last_num_blocks_is_2_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_1_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_lxdjeCteCnqypuE + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_lxdjeCteCnqypuE + +.L_16_blocks_overflow_lxdjeCteCnqypuE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 
+.L_16_blocks_ok_lxdjeCteCnqypuE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xokBAycvbkevxfE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xokBAycvbkevxfE +.L_small_initial_partial_block_xokBAycvbkevxfE: + 
+ + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_xokBAycvbkevxfE +.L_small_initial_compute_done_xokBAycvbkevxfE: +.L_after_reduction_xokBAycvbkevxfE: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_2_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_tqAdjGAqcxebbGj + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_tqAdjGAqcxebbGj + +.L_16_blocks_overflow_tqAdjGAqcxebbGj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_tqAdjGAqcxebbGj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq 
%ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_izsjBCvaDivghqe + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_izsjBCvaDivghqe +.L_small_initial_partial_block_izsjBCvaDivghqe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_izsjBCvaDivghqe: + + orq %r8,%r8 + je .L_after_reduction_izsjBCvaDivghqe + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_izsjBCvaDivghqe: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_3_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_woFDjhpeDAEyeol + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_woFDjhpeDAEyeol + +.L_16_blocks_overflow_woFDjhpeDAEyeol: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_woFDjhpeDAEyeol: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 
1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AqCFGymmhaacFDC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AqCFGymmhaacFDC +.L_small_initial_partial_block_AqCFGymmhaacFDC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq 
%ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AqCFGymmhaacFDC: + + orq %r8,%r8 + je .L_after_reduction_AqCFGymmhaacFDC + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AqCFGymmhaacFDC: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_4_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_BGnDrgfdztzmBGB + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_BGnDrgfdztzmBGB + +.L_16_blocks_overflow_BGnDrgfdztzmBGB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_BGnDrgfdztzmBGB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_uClitrxBorxFyuy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uClitrxBorxFyuy +.L_small_initial_partial_block_uClitrxBorxFyuy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uClitrxBorxFyuy: + + orq %r8,%r8 + je .L_after_reduction_uClitrxBorxFyuy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uClitrxBorxFyuy: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_5_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_wDxAmusyyammDow + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_wDxAmusyyammDow + +.L_16_blocks_overflow_wDxAmusyyammDow: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_wDxAmusyyammDow: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bosguzEFytqmFeq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bosguzEFytqmFeq +.L_small_initial_partial_block_bosguzEFytqmFeq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bosguzEFytqmFeq: + + orq %r8,%r8 + je .L_after_reduction_bosguzEFytqmFeq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bosguzEFytqmFeq: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_6_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_sCzAAgptixxBvip + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_sCzAAgptixxBvip + +.L_16_blocks_overflow_sCzAAgptixxBvip: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_sCzAAgptixxBvip: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FuuimCCibwFkhfx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FuuimCCibwFkhfx +.L_small_initial_partial_block_FuuimCCibwFkhfx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FuuimCCibwFkhfx: + + orq %r8,%r8 + je .L_after_reduction_FuuimCCibwFkhfx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FuuimCCibwFkhfx: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_7_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_gqtukwixiotlvjE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_gqtukwixiotlvjE + +.L_16_blocks_overflow_gqtukwixiotlvjE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_gqtukwixiotlvjE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CBkCykisCgChyAc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CBkCykisCgChyAc +.L_small_initial_partial_block_CBkCykisCgChyAc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CBkCykisCgChyAc: + + orq %r8,%r8 + je .L_after_reduction_CBkCykisCgChyAc + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CBkCykisCgChyAc: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_8_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae 
.L_16_blocks_overflow_Fznlwzcrirmvwxw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_Fznlwzcrirmvwxw + +.L_16_blocks_overflow_Fznlwzcrirmvwxw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_Fznlwzcrirmvwxw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BszjzgFAnDlqhlr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + 
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BszjzgFAnDlqhlr +.L_small_initial_partial_block_BszjzgFAnDlqhlr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BszjzgFAnDlqhlr: + + orq %r8,%r8 + je .L_after_reduction_BszjzgFAnDlqhlr + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_BszjzgFAnDlqhlr: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_9_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_nhcklxyaumrucBe + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_nhcklxyaumrucBe + +.L_16_blocks_overflow_nhcklxyaumrucBe: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_nhcklxyaumrucBe: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + 
vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pofwkmqmhmpaDas + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pofwkmqmhmpaDas +.L_small_initial_partial_block_pofwkmqmhmpaDas: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pofwkmqmhmpaDas: + + orq %r8,%r8 + je .L_after_reduction_pofwkmqmhmpaDas + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pofwkmqmhmpaDas: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_10_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_DpcajcwBdqbwuEm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_DpcajcwBdqbwuEm + +.L_16_blocks_overflow_DpcajcwBdqbwuEm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_DpcajcwBdqbwuEm: + + + + + vbroadcastf64x2 
0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GoickdlxxlCgCmn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GoickdlxxlCgCmn +.L_small_initial_partial_block_GoickdlxxlCgCmn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GoickdlxxlCgCmn: + + orq %r8,%r8 + je .L_after_reduction_GoickdlxxlCgCmn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_GoickdlxxlCgCmn: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_11_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_CzDGlzuDofcmftE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd 
%zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_CzDGlzuDofcmftE + +.L_16_blocks_overflow_CzDGlzuDofcmftE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_CzDGlzuDofcmftE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AfGwErudvfGFkBd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AfGwErudvfGFkBd +.L_small_initial_partial_block_AfGwErudvfGFkBd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AfGwErudvfGFkBd: + 
+ orq %r8,%r8 + je .L_after_reduction_AfGwErudvfGFkBd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AfGwErudvfGFkBd: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_12_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_vFgtdmiGGceAuup + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_vFgtdmiGGceAuup + +.L_16_blocks_overflow_vFgtdmiGGceAuup: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_vFgtdmiGGceAuup: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hAugcokFGbhzzvx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hAugcokFGbhzzvx +.L_small_initial_partial_block_hAugcokFGbhzzvx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hAugcokFGbhzzvx: + + orq %r8,%r8 + je .L_after_reduction_hAugcokFGbhzzvx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hAugcokFGbhzzvx: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_13_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_ApsFAharcbobqcA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_ApsFAharcbobqcA + +.L_16_blocks_overflow_ApsFAharcbobqcA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_ApsFAharcbobqcA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + 
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DkdftFtqeikgrDl + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_DkdftFtqeikgrDl +.L_small_initial_partial_block_DkdftFtqeikgrDl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DkdftFtqeikgrDl: + + orq %r8,%r8 + je .L_after_reduction_DkdftFtqeikgrDl + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DkdftFtqeikgrDl: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_14_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_amhEEFGkEmcdfyg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_amhEEFGkEmcdfyg + +.L_16_blocks_overflow_amhEEFGkEmcdfyg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_amhEEFGkEmcdfyg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 
192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DsqdvjyjtgiDdjk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DsqdvjyjtgiDdjk +.L_small_initial_partial_block_DsqdvjyjtgiDdjk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DsqdvjyjtgiDdjk: + + orq %r8,%r8 + je .L_after_reduction_DsqdvjyjtgiDdjk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DsqdvjyjtgiDdjk: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_15_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_GyCmDqABriaxjxf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 
+ vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_GyCmDqABriaxjxf + +.L_16_blocks_overflow_GyCmDqABriaxjxf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_GyCmDqABriaxjxf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pGoiupmcfezlCDb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pGoiupmcfezlCDb +.L_small_initial_partial_block_pGoiupmcfezlCDb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 
+ vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pGoiupmcfezlCDb: + + orq %r8,%r8 + je .L_after_reduction_pGoiupmcfezlCDb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pGoiupmcfezlCDb: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_16_uFjiwCxmGEbfAFa: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_imDahqossjyafvG + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_imDahqossjyafvG + +.L_16_blocks_overflow_imDahqossjyafvG: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_imDahqossjyafvG: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq 
$0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_stpCjmquwqkvlEu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq 
$8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_stpCjmquwqkvlEu: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_stpCjmquwqkvlEu: + jmp .L_last_blocks_done_uFjiwCxmGEbfAFa +.L_last_num_blocks_is_0_uFjiwCxmGEbfAFa: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_uFjiwCxmGEbfAFa: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_tFbkipsuzBAeEGF +.L_encrypt_32_blocks_tFbkipsuzBAeEGF: + cmpb $240,%r15b + jae .L_16_blocks_overflow_AGsgmucxjDjGrat + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_AGsgmucxjDjGrat +.L_16_blocks_overflow_AGsgmucxjDjGrat: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_AGsgmucxjDjGrat: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb 
%zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_miCaCzFgEsdrxCb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_miCaCzFgEsdrxCb +.L_16_blocks_overflow_miCaCzFgEsdrxCb: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_miCaCzFgEsdrxCb: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_jcdFbiukBEavFGE + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_jcdFbiukBEavFGE + jb .L_last_num_blocks_is_7_1_jcdFbiukBEavFGE + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_jcdFbiukBEavFGE + jb .L_last_num_blocks_is_11_9_jcdFbiukBEavFGE + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_jcdFbiukBEavFGE + ja .L_last_num_blocks_is_16_jcdFbiukBEavFGE + cmpl $14,%r10d + je 
.L_last_num_blocks_is_14_jcdFbiukBEavFGE + jmp .L_last_num_blocks_is_13_jcdFbiukBEavFGE + +.L_last_num_blocks_is_11_9_jcdFbiukBEavFGE: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_jcdFbiukBEavFGE + ja .L_last_num_blocks_is_11_jcdFbiukBEavFGE + jmp .L_last_num_blocks_is_9_jcdFbiukBEavFGE + +.L_last_num_blocks_is_7_1_jcdFbiukBEavFGE: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_jcdFbiukBEavFGE + jb .L_last_num_blocks_is_3_1_jcdFbiukBEavFGE + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_jcdFbiukBEavFGE + je .L_last_num_blocks_is_6_jcdFbiukBEavFGE + jmp .L_last_num_blocks_is_5_jcdFbiukBEavFGE + +.L_last_num_blocks_is_3_1_jcdFbiukBEavFGE: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_jcdFbiukBEavFGE + je .L_last_num_blocks_is_2_jcdFbiukBEavFGE +.L_last_num_blocks_is_1_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_vxxnDcnfkrwsdjp + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_vxxnDcnfkrwsdjp + +.L_16_blocks_overflow_vxxnDcnfkrwsdjp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_vxxnDcnfkrwsdjp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_rjcmxpckvzxcizE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rjcmxpckvzxcizE +.L_small_initial_partial_block_rjcmxpckvzxcizE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_rjcmxpckvzxcizE +.L_small_initial_compute_done_rjcmxpckvzxcizE: +.L_after_reduction_rjcmxpckvzxcizE: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_2_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_uhDoynhcngzlgum + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_uhDoynhcngzlgum + +.L_16_blocks_overflow_uhDoynhcngzlgum: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_uhDoynhcngzlgum: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uukoDhouhnxbvBs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uukoDhouhnxbvBs +.L_small_initial_partial_block_uukoDhouhnxbvBs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uukoDhouhnxbvBs: + + orq %r8,%r8 + je 
.L_after_reduction_uukoDhouhnxbvBs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uukoDhouhnxbvBs: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_3_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_uqbvqDscdfzCyvo + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_uqbvqDscdfzCyvo + +.L_16_blocks_overflow_uqbvqDscdfzCyvo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_uqbvqDscdfzCyvo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AzBBwGideFptDwf + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AzBBwGideFptDwf +.L_small_initial_partial_block_AzBBwGideFptDwf: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AzBBwGideFptDwf: + + orq %r8,%r8 + je .L_after_reduction_AzBBwGideFptDwf + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AzBBwGideFptDwf: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_4_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_kyFozElpAosldpA + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_kyFozElpAosldpA + +.L_16_blocks_overflow_kyFozElpAosldpA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_kyFozElpAosldpA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 
96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_cyDyceqdwxjBzzg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_cyDyceqdwxjBzzg +.L_small_initial_partial_block_cyDyceqdwxjBzzg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_cyDyceqdwxjBzzg: + + orq %r8,%r8 + je .L_after_reduction_cyDyceqdwxjBzzg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_cyDyceqdwxjBzzg: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_5_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + 
kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_lFprftfcjilzpav + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_lFprftfcjilzpav + +.L_16_blocks_overflow_lFprftfcjilzpav: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_lFprftfcjilzpav: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pGBzEdwhzcavspd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pGBzEdwhzcavspd +.L_small_initial_partial_block_pGBzEdwhzcavspd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pGBzEdwhzcavspd: + + orq %r8,%r8 + je .L_after_reduction_pGBzEdwhzcavspd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pGBzEdwhzcavspd: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_6_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_GkzjxqDyGdedavo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_GkzjxqDyGdedavo + +.L_16_blocks_overflow_GkzjxqDyGdedavo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_GkzjxqDyGdedavo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_owicnDDzeheGwrB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_owicnDDzeheGwrB +.L_small_initial_partial_block_owicnDDzeheGwrB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_owicnDDzeheGwrB: + + orq %r8,%r8 + je .L_after_reduction_owicnDDzeheGwrB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_owicnDDzeheGwrB: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_7_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_CaCztGdjulthntc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_CaCztGdjulthntc + +.L_16_blocks_overflow_CaCztGdjulthntc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_CaCztGdjulthntc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_davwqylkhqewajl + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_davwqylkhqewajl +.L_small_initial_partial_block_davwqylkhqewajl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + 
vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_davwqylkhqewajl: + + orq %r8,%r8 + je .L_after_reduction_davwqylkhqewajl + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_davwqylkhqewajl: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_8_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_GbaqslwpsaFuoyz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_GbaqslwpsaFuoyz + +.L_16_blocks_overflow_GbaqslwpsaFuoyz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_GbaqslwpsaFuoyz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FelclvrviuByirb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FelclvrviuByirb +.L_small_initial_partial_block_FelclvrviuByirb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FelclvrviuByirb: + + orq %r8,%r8 + je .L_after_reduction_FelclvrviuByirb + vpxorq %xmm7,%xmm14,%xmm14 
+.L_after_reduction_FelclvrviuByirb: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_9_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_AplsctBswkCkEgg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_AplsctBswkCkEgg + +.L_16_blocks_overflow_AplsctBswkCkEgg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_AplsctBswkCkEgg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jtFtADjqFyogvlv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jtFtADjqFyogvlv +.L_small_initial_partial_block_jtFtADjqFyogvlv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_jtFtADjqFyogvlv: + + orq %r8,%r8 + je .L_after_reduction_jtFtADjqFyogvlv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jtFtADjqFyogvlv: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_10_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_sGofikfdvCsyufv + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_sGofikfdvCsyufv + +.L_16_blocks_overflow_sGofikfdvCsyufv: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_sGofikfdvCsyufv: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tcfdrpyrpqxjGcq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tcfdrpyrpqxjGcq +.L_small_initial_partial_block_tcfdrpyrpqxjGcq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tcfdrpyrpqxjGcq: + + orq %r8,%r8 + je .L_after_reduction_tcfdrpyrpqxjGcq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tcfdrpyrpqxjGcq: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_11_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_toAwkfvytGCcuzd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_toAwkfvytGCcuzd + +.L_16_blocks_overflow_toAwkfvytGCcuzd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_toAwkfvytGCcuzd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 
+ vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wlcDxsmFdsaDbFp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wlcDxsmFdsaDbFp +.L_small_initial_partial_block_wlcDxsmFdsaDbFp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq 
$0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wlcDxsmFdsaDbFp: + + orq %r8,%r8 + je .L_after_reduction_wlcDxsmFdsaDbFp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wlcDxsmFdsaDbFp: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_12_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_teGFdCBFbFbgpyu + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_teGFdCBFbFbgpyu + +.L_16_blocks_overflow_teGFdCBFbFbgpyu: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_teGFdCBFbFbgpyu: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hapodhDjogGiCkb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hapodhDjogGiCkb +.L_small_initial_partial_block_hapodhDjogGiCkb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hapodhDjogGiCkb: + + orq %r8,%r8 + je .L_after_reduction_hapodhDjogGiCkb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hapodhDjogGiCkb: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_13_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_EcrGhzkACEdjiEA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_EcrGhzkACEdjiEA + +.L_16_blocks_overflow_EcrGhzkACEdjiEA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_EcrGhzkACEdjiEA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lgpADhokDilDmjB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq 
$0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lgpADhokDilDmjB +.L_small_initial_partial_block_lgpADhokDilDmjB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lgpADhokDilDmjB: + + orq %r8,%r8 + je .L_after_reduction_lgpADhokDilDmjB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lgpADhokDilDmjB: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_14_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_vfAlEigAGAFFgAm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_vfAlEigAGAFFgAm + +.L_16_blocks_overflow_vfAlEigAGAFFgAm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_vfAlEigAGAFFgAm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 
+ vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + 
vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jvziCnlsAiEavam + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jvziCnlsAiEavam +.L_small_initial_partial_block_jvziCnlsAiEavam: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jvziCnlsAiEavam: + + orq %r8,%r8 + je .L_after_reduction_jvziCnlsAiEavam + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jvziCnlsAiEavam: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_15_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_vDsgChtGCDEtEvr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_vDsgChtGCDEtEvr + +.L_16_blocks_overflow_vDsgChtGCDEtEvr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_vDsgChtGCDEtEvr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 
128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_aaoEnbdnBGewaEG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_aaoEnbdnBGewaEG +.L_small_initial_partial_block_aaoEnbdnBGewaEG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + 
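+/* GHASH the byte-reflected ciphertext blocks against the precomputed powers of H */
+/* kept in the per-key data at (%rsi), accumulating high/low/middle partial products. */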
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_aaoEnbdnBGewaEG: + + orq %r8,%r8 + je .L_after_reduction_aaoEnbdnBGewaEG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_aaoEnbdnBGewaEG: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_16_jcdFbiukBEavFGE: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_rGdvngzaeGtrlsf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_rGdvngzaeGtrlsf + +.L_16_blocks_overflow_rGdvngzaeGtrlsf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_rGdvngzaeGtrlsf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_llADlmtFjlEejxe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_llADlmtFjlEejxe: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_llADlmtFjlEejxe: + jmp .L_last_blocks_done_jcdFbiukBEavFGE +.L_last_num_blocks_is_0_jcdFbiukBEavFGE: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_jcdFbiukBEavFGE: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_tFbkipsuzBAeEGF +.L_encrypt_16_blocks_tFbkipsuzBAeEGF: + cmpb $240,%r15b + jae .L_16_blocks_overflow_AfdGcFddyowgCfD + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd 
%zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_AfdGcFddyowgCfD +.L_16_blocks_overflow_AfdGcFddyowgCfD: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_AfdGcFddyowgCfD: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_DkxrwjzcAFtwGmv + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_DkxrwjzcAFtwGmv + jb .L_last_num_blocks_is_7_1_DkxrwjzcAFtwGmv + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_DkxrwjzcAFtwGmv + jb .L_last_num_blocks_is_11_9_DkxrwjzcAFtwGmv + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_DkxrwjzcAFtwGmv + ja .L_last_num_blocks_is_16_DkxrwjzcAFtwGmv + cmpl $14,%r10d + je .L_last_num_blocks_is_14_DkxrwjzcAFtwGmv + jmp .L_last_num_blocks_is_13_DkxrwjzcAFtwGmv + +.L_last_num_blocks_is_11_9_DkxrwjzcAFtwGmv: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_DkxrwjzcAFtwGmv + ja .L_last_num_blocks_is_11_DkxrwjzcAFtwGmv + jmp .L_last_num_blocks_is_9_DkxrwjzcAFtwGmv + +.L_last_num_blocks_is_7_1_DkxrwjzcAFtwGmv: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_DkxrwjzcAFtwGmv + jb .L_last_num_blocks_is_3_1_DkxrwjzcAFtwGmv + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_DkxrwjzcAFtwGmv + je .L_last_num_blocks_is_6_DkxrwjzcAFtwGmv + jmp .L_last_num_blocks_is_5_DkxrwjzcAFtwGmv + +.L_last_num_blocks_is_3_1_DkxrwjzcAFtwGmv: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_DkxrwjzcAFtwGmv + je .L_last_num_blocks_is_2_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_1_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_AeBdutzBBGkrhww + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_AeBdutzBBGkrhww + +.L_16_blocks_overflow_AeBdutzBBGkrhww: + 
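+/* The low byte of the big-endian counter would carry here, so the block is */
+/* byte-reflected, incremented with a full 32-bit vpaddd, and reflected back. */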
vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_AeBdutzBBGkrhww: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sanDChDEAsbDbDy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_sanDChDEAsbDbDy +.L_small_initial_partial_block_sanDChDEAsbDbDy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_sanDChDEAsbDbDy +.L_small_initial_compute_done_sanDChDEAsbDbDy: +.L_after_reduction_sanDChDEAsbDbDy: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_2_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_zEobAyflaqodkxt + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_zEobAyflaqodkxt + +.L_16_blocks_overflow_zEobAyflaqodkxt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_zEobAyflaqodkxt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + 
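+/* Fold the four 128-bit lanes of the GHASH accumulators together while the */
+/* remaining AES rounds of the counter block complete, then reduce via POLY2. */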
vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_btzmvhkGEADbAkx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_btzmvhkGEADbAkx +.L_small_initial_partial_block_btzmvhkGEADbAkx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_btzmvhkGEADbAkx: + + orq %r8,%r8 + je .L_after_reduction_btzmvhkGEADbAkx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_btzmvhkGEADbAkx: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_3_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_gcfAxoFzqodzGEz + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_gcfAxoFzqodzGEz + +.L_16_blocks_overflow_gcfAxoFzqodzGEz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_gcfAxoFzqodzGEz: + 
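+/* Three-block tail: run the AES rounds on the last counter blocks (the trailing */
+/* partial load/store is masked through %k1) while the previous 16 ciphertext */
+/* blocks, saved byte-reflected at 1280..1472(%rsp), are folded into GHASH with */
+/* the hash-key powers staged at 512..704(%rsp). */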
+ + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EasBgBicpEglkiw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EasBgBicpEglkiw +.L_small_initial_partial_block_EasBgBicpEglkiw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EasBgBicpEglkiw: + + orq %r8,%r8 + je .L_after_reduction_EasBgBicpEglkiw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_EasBgBicpEglkiw: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_4_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_manbGbfyvfFsrnl + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_manbGbfyvfFsrnl + +.L_16_blocks_overflow_manbGbfyvfFsrnl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_manbGbfyvfFsrnl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 
0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kwtpvxfGBCymBsb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_kwtpvxfGBCymBsb +.L_small_initial_partial_block_kwtpvxfGBCymBsb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + 
vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kwtpvxfGBCymBsb: + + orq %r8,%r8 + je .L_after_reduction_kwtpvxfGBCymBsb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_kwtpvxfGBCymBsb: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_5_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_fjElnuxjdEdFEct + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_fjElnuxjdEdFEct + +.L_16_blocks_overflow_fjElnuxjdEdFEct: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_fjElnuxjdEdFEct: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq 
%xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DbgCAmgvxscuoqv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DbgCAmgvxscuoqv +.L_small_initial_partial_block_DbgCAmgvxscuoqv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DbgCAmgvxscuoqv: + + orq %r8,%r8 + je .L_after_reduction_DbgCAmgvxscuoqv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DbgCAmgvxscuoqv: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv 
+.L_last_num_blocks_is_6_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_tfrvDdzahijbwmB + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_tfrvDdzahijbwmB + +.L_16_blocks_overflow_tfrvDdzahijbwmB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_tfrvDdzahijbwmB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq 
$0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uEnwhzkdGwAplec + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uEnwhzkdGwAplec +.L_small_initial_partial_block_uEnwhzkdGwAplec: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uEnwhzkdGwAplec: + + orq %r8,%r8 + je .L_after_reduction_uEnwhzkdGwAplec + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uEnwhzkdGwAplec: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_7_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d 
+ jae .L_16_blocks_overflow_qidtflFxFddzhgg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_qidtflFxFddzhgg + +.L_16_blocks_overflow_qidtflFxFddzhgg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_qidtflFxFddzhgg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + 
vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qvicAgCgBiisxsr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qvicAgCgBiisxsr +.L_small_initial_partial_block_qvicAgCgBiisxsr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qvicAgCgBiisxsr: + + orq %r8,%r8 + je .L_after_reduction_qvicAgCgBiisxsr + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_qvicAgCgBiisxsr: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_8_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_luzsesiwggypeey + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp 
.L_16_blocks_ok_luzsesiwggypeey + +.L_16_blocks_overflow_luzsesiwggypeey: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_luzsesiwggypeey: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq 
%zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dhgyBxajscbfima + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dhgyBxajscbfima +.L_small_initial_partial_block_dhgyBxajscbfima: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dhgyBxajscbfima: + + orq %r8,%r8 + je .L_after_reduction_dhgyBxajscbfima + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dhgyBxajscbfima: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_9_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_EkueqaGdhDjCdgp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_EkueqaGdhDjCdgp + +.L_16_blocks_overflow_EkueqaGdhDjCdgp: + vpshufb 
%zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_EkueqaGdhDjCdgp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_heqAoqbbuAkcyrx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_heqAoqbbuAkcyrx +.L_small_initial_partial_block_heqAoqbbuAkcyrx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_heqAoqbbuAkcyrx: + + orq %r8,%r8 + je .L_after_reduction_heqAoqbbuAkcyrx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_heqAoqbbuAkcyrx: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_10_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_wvgCfboudsrmujp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_wvgCfboudsrmujp + +.L_16_blocks_overflow_wvgCfboudsrmujp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_wvgCfboudsrmujp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yxeqEqghwAplnqh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yxeqEqghwAplnqh +.L_small_initial_partial_block_yxeqEqghwAplnqh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq 
$0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yxeqEqghwAplnqh: + + orq %r8,%r8 + je .L_after_reduction_yxeqEqghwAplnqh + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_yxeqEqghwAplnqh: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_11_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_cwemdvzqaqrBmvF + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_cwemdvzqaqrBmvF + +.L_16_blocks_overflow_cwemdvzqaqrBmvF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_cwemdvzqaqrBmvF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tngolGfEmxmwAAg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + 
vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tngolGfEmxmwAAg +.L_small_initial_partial_block_tngolGfEmxmwAAg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tngolGfEmxmwAAg: + + orq %r8,%r8 + je .L_after_reduction_tngolGfEmxmwAAg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tngolGfEmxmwAAg: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_12_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_viscCxhaitpgcDa + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_viscCxhaitpgcDa + +.L_16_blocks_overflow_viscCxhaitpgcDa: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_viscCxhaitpgcDa: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + 
vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AEGqAevCpluaCEe + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AEGqAevCpluaCEe +.L_small_initial_partial_block_AEGqAevCpluaCEe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AEGqAevCpluaCEe: + + orq %r8,%r8 + je .L_after_reduction_AEGqAevCpluaCEe + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AEGqAevCpluaCEe: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_13_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq 
%r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_aswqypGGFyocuvD + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_aswqypGGFyocuvD + +.L_16_blocks_overflow_aswqypGGFyocuvD: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_aswqypGGFyocuvD: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 
160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ddibpDBalvcbdjr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ddibpDBalvcbdjr 
+.L_small_initial_partial_block_ddibpDBalvcbdjr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ddibpDBalvcbdjr: + + orq %r8,%r8 + je .L_after_reduction_ddibpDBalvcbdjr + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ddibpDBalvcbdjr: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_14_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_uDoedupEeCpfBar + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_uDoedupEeCpfBar + +.L_16_blocks_overflow_uDoedupEeCpfBar: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_uDoedupEeCpfBar: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc 
%ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AilxjDdBvvoizqE + + + + + + subq 
$16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AilxjDdBvvoizqE +.L_small_initial_partial_block_AilxjDdBvvoizqE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AilxjDdBvvoizqE: + + orq %r8,%r8 + je .L_after_reduction_AilxjDdBvvoizqE + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AilxjDdBvvoizqE: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_15_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_qsiCcemvFCbgltw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_qsiCcemvFCbgltw + +.L_16_blocks_overflow_qsiCcemvFCbgltw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_qsiCcemvFCbgltw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uvFingxredipaxs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 
POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uvFingxredipaxs +.L_small_initial_partial_block_uvFingxredipaxs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uvFingxredipaxs: + + orq %r8,%r8 + je .L_after_reduction_uvFingxredipaxs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uvFingxredipaxs: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_16_DkxrwjzcAFtwGmv: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_pAbgwDdgnghCfey + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_pAbgwDdgnghCfey + +.L_16_blocks_overflow_pAbgwDdgnghCfey: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_pAbgwDdgnghCfey: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + 
vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_fFkawEbFoBxjEyl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fFkawEbFoBxjEyl: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fFkawEbFoBxjEyl: + jmp .L_last_blocks_done_DkxrwjzcAFtwGmv +.L_last_num_blocks_is_0_DkxrwjzcAFtwGmv: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq 
$0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_DkxrwjzcAFtwGmv: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_tFbkipsuzBAeEGF + +.L_message_below_32_blocks_tFbkipsuzBAeEGF: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_lpEjyDrFbrgBuyj + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + 
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_lpEjyDrFbrgBuyj: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_wmGtzaxjkAduAzk + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_wmGtzaxjkAduAzk + jb .L_last_num_blocks_is_7_1_wmGtzaxjkAduAzk + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_wmGtzaxjkAduAzk + jb .L_last_num_blocks_is_11_9_wmGtzaxjkAduAzk + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_wmGtzaxjkAduAzk + ja .L_last_num_blocks_is_16_wmGtzaxjkAduAzk + cmpl $14,%r10d + je .L_last_num_blocks_is_14_wmGtzaxjkAduAzk + jmp .L_last_num_blocks_is_13_wmGtzaxjkAduAzk + +.L_last_num_blocks_is_11_9_wmGtzaxjkAduAzk: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_wmGtzaxjkAduAzk + ja .L_last_num_blocks_is_11_wmGtzaxjkAduAzk + jmp .L_last_num_blocks_is_9_wmGtzaxjkAduAzk + +.L_last_num_blocks_is_7_1_wmGtzaxjkAduAzk: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_wmGtzaxjkAduAzk + jb .L_last_num_blocks_is_3_1_wmGtzaxjkAduAzk + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_wmGtzaxjkAduAzk + je .L_last_num_blocks_is_6_wmGtzaxjkAduAzk + jmp .L_last_num_blocks_is_5_wmGtzaxjkAduAzk + +.L_last_num_blocks_is_3_1_wmGtzaxjkAduAzk: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_wmGtzaxjkAduAzk + je .L_last_num_blocks_is_2_wmGtzaxjkAduAzk +.L_last_num_blocks_is_1_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_zAppBdlpFnqjcjn + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_zAppBdlpFnqjcjn + +.L_16_blocks_overflow_zAppBdlpFnqjcjn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_zAppBdlpFnqjcjn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ohletviGGDnsqsh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ohletviGGDnsqsh +.L_small_initial_partial_block_ohletviGGDnsqsh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_ohletviGGDnsqsh +.L_small_initial_compute_done_ohletviGGDnsqsh: +.L_after_reduction_ohletviGGDnsqsh: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_2_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_bApGhpvksEbgnlq + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_bApGhpvksEbgnlq + +.L_16_blocks_overflow_bApGhpvksEbgnlq: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_bApGhpvksEbgnlq: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq 
%ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_atfqpoawbrCaGCo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_atfqpoawbrCaGCo +.L_small_initial_partial_block_atfqpoawbrCaGCo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq 
%zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_atfqpoawbrCaGCo: + + orq %r8,%r8 + je .L_after_reduction_atfqpoawbrCaGCo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_atfqpoawbrCaGCo: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_3_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_ngmcavmrDqtqduc + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_ngmcavmrDqtqduc + +.L_16_blocks_overflow_ngmcavmrDqtqduc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_ngmcavmrDqtqduc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EgjBqgvkBgauzsF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EgjBqgvkBgauzsF +.L_small_initial_partial_block_EgjBqgvkBgauzsF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EgjBqgvkBgauzsF: + + orq %r8,%r8 + je .L_after_reduction_EgjBqgvkBgauzsF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_EgjBqgvkBgauzsF: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_4_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_oDoDxdeeEEpoaof + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_oDoDxdeeEEpoaof + +.L_16_blocks_overflow_oDoDxdeeEEpoaof: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_oDoDxdeeEEpoaof: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + 
vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_akFyBqpssGEhllv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_akFyBqpssGEhllv +.L_small_initial_partial_block_akFyBqpssGEhllv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq 
%xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_akFyBqpssGEhllv: + + orq %r8,%r8 + je .L_after_reduction_akFyBqpssGEhllv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_akFyBqpssGEhllv: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_5_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_vwvElrjpjpxAvis + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_vwvElrjpjpxAvis + +.L_16_blocks_overflow_vwvElrjpjpxAvis: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_vwvElrjpjpxAvis: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DFFzfAbyBGFnoDn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DFFzfAbyBGFnoDn +.L_small_initial_partial_block_DFFzfAbyBGFnoDn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DFFzfAbyBGFnoDn: + + orq %r8,%r8 + je .L_after_reduction_DFFzfAbyBGFnoDn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DFFzfAbyBGFnoDn: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_6_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_vyDvhDFpixkDdnk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_vyDvhDFpixkDdnk + +.L_16_blocks_overflow_vyDvhDFpixkDdnk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + 
vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_vyDvhDFpixkDdnk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FEocggExrFlAoic + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq 
$0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FEocggExrFlAoic +.L_small_initial_partial_block_FEocggExrFlAoic: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FEocggExrFlAoic: + + orq %r8,%r8 + je .L_after_reduction_FEocggExrFlAoic + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FEocggExrFlAoic: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_7_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_fvtxctukrBFoshm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_fvtxctukrBFoshm + +.L_16_blocks_overflow_fvtxctukrBFoshm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_fvtxctukrBFoshm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 
48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zsgnBgnADqqaFdG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_zsgnBgnADqqaFdG +.L_small_initial_partial_block_zsgnBgnADqqaFdG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zsgnBgnADqqaFdG: + + orq %r8,%r8 + je .L_after_reduction_zsgnBgnADqqaFdG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zsgnBgnADqqaFdG: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_8_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_ACyFnxEijEcdofC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_ACyFnxEijEcdofC + +.L_16_blocks_overflow_ACyFnxEijEcdofC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_ACyFnxEijEcdofC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pinsyEqvsAdoiak + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pinsyEqvsAdoiak +.L_small_initial_partial_block_pinsyEqvsAdoiak: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + 
vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pinsyEqvsAdoiak: + + orq %r8,%r8 + je .L_after_reduction_pinsyEqvsAdoiak + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pinsyEqvsAdoiak: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_9_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_AhlgEzovddtvDon + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_AhlgEzovddtvDon + +.L_16_blocks_overflow_AhlgEzovddtvDon: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_AhlgEzovddtvDon: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dgkfebGqcuDCjgt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dgkfebGqcuDCjgt +.L_small_initial_partial_block_dgkfebGqcuDCjgt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq 
%zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dgkfebGqcuDCjgt: + + orq %r8,%r8 + je .L_after_reduction_dgkfebGqcuDCjgt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dgkfebGqcuDCjgt: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_10_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_AcoEnlwuyyjhDuq + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_AcoEnlwuyyjhDuq + +.L_16_blocks_overflow_AcoEnlwuyyjhDuq: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_AcoEnlwuyyjhDuq: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_upsmGyaxeoyuGwq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_upsmGyaxeoyuGwq +.L_small_initial_partial_block_upsmGyaxeoyuGwq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_upsmGyaxeoyuGwq: + + orq %r8,%r8 + je .L_after_reduction_upsmGyaxeoyuGwq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_upsmGyaxeoyuGwq: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_11_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_coDokyrbzujjnFG + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_coDokyrbzujjnFG + +.L_16_blocks_overflow_coDokyrbzujjnFG: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_coDokyrbzujjnFG: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dtFFjiEElouyrlF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dtFFjiEElouyrlF +.L_small_initial_partial_block_dtFFjiEElouyrlF: 
+ + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dtFFjiEElouyrlF: + + orq %r8,%r8 + je .L_after_reduction_dtFFjiEElouyrlF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dtFFjiEElouyrlF: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_12_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_uvhijsplaEEmlke + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_uvhijsplaEEmlke + +.L_16_blocks_overflow_uvhijsplaEEmlke: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_uvhijsplaEEmlke: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sArmCAuDwnDnahw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 
POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_sArmCAuDwnDnahw +.L_small_initial_partial_block_sArmCAuDwnDnahw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_sArmCAuDwnDnahw: + + orq %r8,%r8 + je .L_after_reduction_sArmCAuDwnDnahw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_sArmCAuDwnDnahw: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_13_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_dCqAGwyhtFDDhuf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_dCqAGwyhtFDDhuf + +.L_16_blocks_overflow_dCqAGwyhtFDDhuf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_dCqAGwyhtFDDhuf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AoFriGggjmCqdFe + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 
208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AoFriGggjmCqdFe +.L_small_initial_partial_block_AoFriGggjmCqdFe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AoFriGggjmCqdFe: + + orq %r8,%r8 + je .L_after_reduction_AoFriGggjmCqdFe + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AoFriGggjmCqdFe: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_14_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_eymtigzEympdfbq 
+ vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_eymtigzEympdfbq + +.L_16_blocks_overflow_eymtigzEympdfbq: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_eymtigzEympdfbq: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_psAhdEAgnjgwhnp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_psAhdEAgnjgwhnp +.L_small_initial_partial_block_psAhdEAgnjgwhnp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + 
vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_psAhdEAgnjgwhnp: + + orq %r8,%r8 + je .L_after_reduction_psAhdEAgnjgwhnp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_psAhdEAgnjgwhnp: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_15_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_qGavfpFFnvaCwAd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_qGavfpFFnvaCwAd + +.L_16_blocks_overflow_qGavfpFFnvaCwAd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_qGavfpFFnvaCwAd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq 
$0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DBkpyuBbpopmDCv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DBkpyuBbpopmDCv +.L_small_initial_partial_block_DBkpyuBbpopmDCv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DBkpyuBbpopmDCv: + + orq %r8,%r8 + je .L_after_reduction_DBkpyuBbpopmDCv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DBkpyuBbpopmDCv: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_16_wmGtzaxjkAduAzk: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_jfFqqEmsqrheBbh + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_jfFqqEmsqrheBbh + +.L_16_blocks_overflow_jfFqqEmsqrheBbh: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_jfFqqEmsqrheBbh: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 
%zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_CEafoEfoaioCrtB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CEafoEfoaioCrtB: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CEafoEfoaioCrtB: + jmp .L_last_blocks_done_wmGtzaxjkAduAzk +.L_last_num_blocks_is_0_wmGtzaxjkAduAzk: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + 
vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_wmGtzaxjkAduAzk: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_tFbkipsuzBAeEGF + +.L_message_below_equal_16_blocks_tFbkipsuzBAeEGF: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_tpcppgjkDAAGbmz + jl .L_small_initial_num_blocks_is_7_1_tpcppgjkDAAGbmz + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_tpcppgjkDAAGbmz + jl .L_small_initial_num_blocks_is_11_9_tpcppgjkDAAGbmz + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_tpcppgjkDAAGbmz + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_tpcppgjkDAAGbmz + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_tpcppgjkDAAGbmz + jmp .L_small_initial_num_blocks_is_13_tpcppgjkDAAGbmz + +.L_small_initial_num_blocks_is_11_9_tpcppgjkDAAGbmz: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_tpcppgjkDAAGbmz + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_tpcppgjkDAAGbmz + jmp .L_small_initial_num_blocks_is_9_tpcppgjkDAAGbmz + +.L_small_initial_num_blocks_is_7_1_tpcppgjkDAAGbmz: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_tpcppgjkDAAGbmz + jl .L_small_initial_num_blocks_is_3_1_tpcppgjkDAAGbmz + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_tpcppgjkDAAGbmz + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_tpcppgjkDAAGbmz + jmp .L_small_initial_num_blocks_is_5_tpcppgjkDAAGbmz + +.L_small_initial_num_blocks_is_3_1_tpcppgjkDAAGbmz: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_tpcppgjkDAAGbmz + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_tpcppgjkDAAGbmz + + + + + +.L_small_initial_num_blocks_is_1_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vkGpbehGialtrzj + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vkGpbehGialtrzj +.L_small_initial_partial_block_vkGpbehGialtrzj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_vkGpbehGialtrzj +.L_small_initial_compute_done_vkGpbehGialtrzj: +.L_after_reduction_vkGpbehGialtrzj: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_2_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yrCuttqEucBxwFi + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yrCuttqEucBxwFi +.L_small_initial_partial_block_yrCuttqEucBxwFi: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yrCuttqEucBxwFi: + + orq %r8,%r8 + je .L_after_reduction_yrCuttqEucBxwFi + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_yrCuttqEucBxwFi: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_3_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kgsCrgatEoGephk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq 
%xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_kgsCrgatEoGephk +.L_small_initial_partial_block_kgsCrgatEoGephk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kgsCrgatEoGephk: + + orq %r8,%r8 + je .L_after_reduction_kgsCrgatEoGephk + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_kgsCrgatEoGephk: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_4_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_flxrhfiogcrnqye + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + 
vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_flxrhfiogcrnqye +.L_small_initial_partial_block_flxrhfiogcrnqye: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_flxrhfiogcrnqye: + + orq %r8,%r8 + je .L_after_reduction_flxrhfiogcrnqye + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_flxrhfiogcrnqye: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_5_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %xmm29,%xmm3,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gFzmwxijGDfbEEt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gFzmwxijGDfbEEt +.L_small_initial_partial_block_gFzmwxijGDfbEEt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gFzmwxijGDfbEEt: + + orq %r8,%r8 + je .L_after_reduction_gFzmwxijGDfbEEt + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_gFzmwxijGDfbEEt: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_6_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + 
vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %ymm29,%ymm3,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ywvaiFFsGziikok + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ywvaiFFsGziikok +.L_small_initial_partial_block_ywvaiFFsGziikok: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq 
%xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ywvaiFFsGziikok: + + orq %r8,%r8 + je .L_after_reduction_ywvaiFFsGziikok + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ywvaiFFsGziikok: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_7_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vjjxFhBDbbgteCx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq 
%xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vjjxFhBDbbgteCx +.L_small_initial_partial_block_vjjxFhBDbbgteCx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vjjxFhBDbbgteCx: + + orq %r8,%r8 + je .L_after_reduction_vjjxFhBDbbgteCx + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_vjjxFhBDbbgteCx: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_8_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast 
%zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jvbFniEeBiBFBmv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jvbFniEeBiBFBmv +.L_small_initial_partial_block_jvbFniEeBiBFBmv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jvbFniEeBiBFBmv: + + orq %r8,%r8 + je .L_after_reduction_jvbFniEeBiBFBmv + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_jvbFniEeBiBFBmv: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_9_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + 
subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %xmm29,%xmm4,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zyfCoCjsyFFnpwn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq 
%xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zyfCoCjsyFFnpwn +.L_small_initial_partial_block_zyfCoCjsyFFnpwn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zyfCoCjsyFFnpwn: + + orq %r8,%r8 + je .L_after_reduction_zyfCoCjsyFFnpwn + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_zyfCoCjsyFFnpwn: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_10_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + 
vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %ymm29,%ymm4,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GlGwjupayCEmAmk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GlGwjupayCEmAmk +.L_small_initial_partial_block_GlGwjupayCEmAmk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq 
$8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GlGwjupayCEmAmk: + + orq %r8,%r8 + je .L_after_reduction_GlGwjupayCEmAmk + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_GlGwjupayCEmAmk: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_11_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AedaxoBdGfervsb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + 
vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AedaxoBdGfervsb +.L_small_initial_partial_block_AedaxoBdGfervsb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AedaxoBdGfervsb: + + orq %r8,%r8 + je .L_after_reduction_AedaxoBdGfervsb + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_AedaxoBdGfervsb: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_12_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd 
ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zfkGparhhvDqahn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq 
%ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zfkGparhhvDqahn +.L_small_initial_partial_block_zfkGparhhvDqahn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zfkGparhhvDqahn: + + orq %r8,%r8 + je .L_after_reduction_zfkGparhhvDqahn + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_zfkGparhhvDqahn: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_13_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc 
%xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %xmm29,%xmm5,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uDsrwxuwAvaluno + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + 
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uDsrwxuwAvaluno +.L_small_initial_partial_block_uDsrwxuwAvaluno: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uDsrwxuwAvaluno: + + orq %r8,%r8 + je .L_after_reduction_uDsrwxuwAvaluno + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_uDsrwxuwAvaluno: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_14_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 
96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %ymm29,%ymm5,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_awnsCplrcfgEbDA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_awnsCplrcfgEbDA +.L_small_initial_partial_block_awnsCplrcfgEbDA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq 
%zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_awnsCplrcfgEbDA: + + orq %r8,%r8 + je .L_after_reduction_awnsCplrcfgEbDA + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_awnsCplrcfgEbDA: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_15_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 
+ vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hgEBfdDtdFvGqjb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hgEBfdDtdFvGqjb +.L_small_initial_partial_block_hgEBfdDtdFvGqjb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq 
%zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hgEBfdDtdFvGqjb: + + orq %r8,%r8 + je .L_after_reduction_hgEBfdDtdFvGqjb + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_hgEBfdDtdFvGqjb: + jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz +.L_small_initial_num_blocks_is_16_tpcppgjkDAAGbmz: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 
+ vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_mbufndcrlyapBCF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mbufndcrlyapBCF: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_mbufndcrlyapBCF: +.L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz: 
+.L_ghash_done_tFbkipsuzBAeEGF: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_tFbkipsuzBAeEGF: + jmp .Lexit_gcm_encrypt +.align 32 +.Laes_gcm_encrypt_256_avx512: + orq %r8,%r8 + je .L_enc_dec_done_eawnuBpGmxcBoDC + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_yomlCiqlqyhGbxA + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_yomlCiqlqyhGbxA + subq %r13,%r12 +.L_no_extra_mask_yomlCiqlqyhGbxA: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm3,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_yomlCiqlqyhGbxA + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_yomlCiqlqyhGbxA + +.L_partial_incomplete_yomlCiqlqyhGbxA: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_yomlCiqlqyhGbxA: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_yomlCiqlqyhGbxA: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_eawnuBpGmxcBoDC + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_eawnuBpGmxcBoDC + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_iqGewgDgqvuhkra + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_iqGewgDgqvuhkra +.L_next_16_overflow_iqGewgDgqvuhkra: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_iqGewgDgqvuhkra: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc 
%zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_alwniGiGuuwbdou + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_alwniGiGuuwbdou: + cmpq $512,%r8 + jb .L_message_below_32_blocks_eawnuBpGmxcBoDC + + + + cmpb $240,%r15b + jae .L_next_16_overflow_wkhDhbijnuGGCmD + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_wkhDhbijnuGGCmD +.L_next_16_overflow_wkhDhbijnuGGCmD: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb 
%zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_wkhDhbijnuGGCmD: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_xuEcimfukbaBqDu + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq 
%zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + 
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_xuEcimfukbaBqDu: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_eawnuBpGmxcBoDC +.L_encrypt_big_nblocks_eawnuBpGmxcBoDC: + cmpb $240,%r15b + jae .L_16_blocks_overflow_hsjyfxApibhdaao + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_hsjyfxApibhdaao +.L_16_blocks_overflow_hsjyfxApibhdaao: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_hsjyfxApibhdaao: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_FyafAtAzhgGauwk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_FyafAtAzhgGauwk +.L_16_blocks_overflow_FyafAtAzhgGauwk: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_FyafAtAzhgGauwk: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + 
+ + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 
%zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_mshygnywvbAbxuk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_mshygnywvbAbxuk +.L_16_blocks_overflow_mshygnywvbAbxuk: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_mshygnywvbAbxuk: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + 
vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_eawnuBpGmxcBoDC + +.L_no_more_big_nblocks_eawnuBpGmxcBoDC: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_eawnuBpGmxcBoDC + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_eawnuBpGmxcBoDC +.L_encrypt_0_blocks_ghash_32_eawnuBpGmxcBoDC: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + 
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_CAikcjdGDugFfth + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_CAikcjdGDugFfth + jb .L_last_num_blocks_is_7_1_CAikcjdGDugFfth + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_CAikcjdGDugFfth + jb .L_last_num_blocks_is_11_9_CAikcjdGDugFfth + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_CAikcjdGDugFfth + ja .L_last_num_blocks_is_16_CAikcjdGDugFfth + cmpl $14,%r10d + je .L_last_num_blocks_is_14_CAikcjdGDugFfth + jmp .L_last_num_blocks_is_13_CAikcjdGDugFfth + +.L_last_num_blocks_is_11_9_CAikcjdGDugFfth: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_CAikcjdGDugFfth + ja .L_last_num_blocks_is_11_CAikcjdGDugFfth + jmp .L_last_num_blocks_is_9_CAikcjdGDugFfth + +.L_last_num_blocks_is_7_1_CAikcjdGDugFfth: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_CAikcjdGDugFfth + jb .L_last_num_blocks_is_3_1_CAikcjdGDugFfth + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_CAikcjdGDugFfth + je .L_last_num_blocks_is_6_CAikcjdGDugFfth + jmp .L_last_num_blocks_is_5_CAikcjdGDugFfth + +.L_last_num_blocks_is_3_1_CAikcjdGDugFfth: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_CAikcjdGDugFfth + je .L_last_num_blocks_is_2_CAikcjdGDugFfth +.L_last_num_blocks_is_1_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_xFvljgxvqrrjiEx + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_xFvljgxvqrrjiEx + +.L_16_blocks_overflow_xFvljgxvqrrjiEx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_xFvljgxvqrrjiEx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + 
vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qxurhxfinuxAakr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qxurhxfinuxAakr +.L_small_initial_partial_block_qxurhxfinuxAakr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_qxurhxfinuxAakr +.L_small_initial_compute_done_qxurhxfinuxAakr: +.L_after_reduction_qxurhxfinuxAakr: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_2_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_jkwkgdBwnfqtmoz + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_jkwkgdBwnfqtmoz + +.L_16_blocks_overflow_jkwkgdBwnfqtmoz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_jkwkgdBwnfqtmoz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FuEgfclAfodbltt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FuEgfclAfodbltt +.L_small_initial_partial_block_FuEgfclAfodbltt: + + + + + + + + + movq 
%r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FuEgfclAfodbltt: + + orq %r8,%r8 + je .L_after_reduction_FuEgfclAfodbltt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FuEgfclAfodbltt: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_3_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_rlpicECjalEogkA + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_rlpicECjalEogkA + +.L_16_blocks_overflow_rlpicECjalEogkA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_rlpicECjalEogkA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + 
vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CuzDDhbEvttwEEk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CuzDDhbEvttwEEk +.L_small_initial_partial_block_CuzDDhbEvttwEEk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CuzDDhbEvttwEEk: + + orq %r8,%r8 + je .L_after_reduction_CuzDDhbEvttwEEk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CuzDDhbEvttwEEk: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_4_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_gqkAClvbnegzAmA + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_gqkAClvbnegzAmA + +.L_16_blocks_overflow_gqkAClvbnegzAmA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_gqkAClvbnegzAmA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + 
vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xcnzwhtrnbgDqfy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xcnzwhtrnbgDqfy +.L_small_initial_partial_block_xcnzwhtrnbgDqfy: + + + + 
+ + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xcnzwhtrnbgDqfy: + + orq %r8,%r8 + je .L_after_reduction_xcnzwhtrnbgDqfy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xcnzwhtrnbgDqfy: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_5_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_FklAbbifjuDAcpD + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_FklAbbifjuDAcpD + +.L_16_blocks_overflow_FklAbbifjuDAcpD: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_FklAbbifjuDAcpD: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oxoctmohDgCBefA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oxoctmohDgCBefA +.L_small_initial_partial_block_oxoctmohDgCBefA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oxoctmohDgCBefA: + + orq %r8,%r8 + je .L_after_reduction_oxoctmohDgCBefA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oxoctmohDgCBefA: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_6_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_odCCAydbBFAapzd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_odCCAydbBFAapzd + +.L_16_blocks_overflow_odCCAydbBFAapzd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_odCCAydbBFAapzd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qlwikcksldoilrG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qlwikcksldoilrG +.L_small_initial_partial_block_qlwikcksldoilrG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qlwikcksldoilrG: + + orq %r8,%r8 + je .L_after_reduction_qlwikcksldoilrG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_qlwikcksldoilrG: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_7_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae 
.L_16_blocks_overflow_mjwDlmhvzElddng + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_mjwDlmhvzElddng + +.L_16_blocks_overflow_mjwDlmhvzElddng: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_mjwDlmhvzElddng: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_unqgfDFcvabkGta 
+ + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_unqgfDFcvabkGta +.L_small_initial_partial_block_unqgfDFcvabkGta: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_unqgfDFcvabkGta: + + orq %r8,%r8 + je .L_after_reduction_unqgfDFcvabkGta + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_unqgfDFcvabkGta: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_8_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_EinBcyEEyChknsj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_EinBcyEEyChknsj + +.L_16_blocks_overflow_EinBcyEEyChknsj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_EinBcyEEyChknsj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + 
+ vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ejuhaaqjamhcjqF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq 
%zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ejuhaaqjamhcjqF +.L_small_initial_partial_block_ejuhaaqjamhcjqF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ejuhaaqjamhcjqF: + + orq %r8,%r8 + je .L_after_reduction_ejuhaaqjamhcjqF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ejuhaaqjamhcjqF: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_9_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_mhxEmCxxjyDqdDo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_mhxEmCxxjyDqdDo + +.L_16_blocks_overflow_mhxEmCxxjyDqdDo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_mhxEmCxxjyDqdDo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + 
vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zdofzxhsAexptkx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq 
%zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zdofzxhsAexptkx +.L_small_initial_partial_block_zdofzxhsAexptkx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zdofzxhsAexptkx: + + orq %r8,%r8 + je .L_after_reduction_zdofzxhsAexptkx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zdofzxhsAexptkx: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_10_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_rvskGvkumwEhhsc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_rvskGvkumwEhhsc + +.L_16_blocks_overflow_rvskGvkumwEhhsc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_rvskGvkumwEhhsc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gngjmGDkBquyveG + + + 
+ + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gngjmGDkBquyveG +.L_small_initial_partial_block_gngjmGDkBquyveG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gngjmGDkBquyveG: + + orq %r8,%r8 + je .L_after_reduction_gngjmGDkBquyveG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_gngjmGDkBquyveG: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_11_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae 
.L_16_blocks_overflow_Dtnnktpbavbarsp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_Dtnnktpbavbarsp + +.L_16_blocks_overflow_Dtnnktpbavbarsp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_Dtnnktpbavbarsp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xfvylkhgAonGlpn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xfvylkhgAonGlpn +.L_small_initial_partial_block_xfvylkhgAonGlpn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xfvylkhgAonGlpn: + + orq %r8,%r8 + je .L_after_reduction_xfvylkhgAonGlpn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xfvylkhgAonGlpn: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_12_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_bpklztjgEEdhFxz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_bpklztjgEEdhFxz + +.L_16_blocks_overflow_bpklztjgEEdhFxz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_bpklztjgEEdhFxz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + 
vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dgtbwzqgvnDyDmt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dgtbwzqgvnDyDmt +.L_small_initial_partial_block_dgtbwzqgvnDyDmt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 
$2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dgtbwzqgvnDyDmt: + + orq %r8,%r8 + je .L_after_reduction_dgtbwzqgvnDyDmt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dgtbwzqgvnDyDmt: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_13_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_BBkhDhGlvcaehas + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_BBkhDhGlvcaehas + +.L_16_blocks_overflow_BBkhDhGlvcaehas: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_BBkhDhGlvcaehas: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq 
$0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_euhapEbhfhxemzw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq 
$0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_euhapEbhfhxemzw +.L_small_initial_partial_block_euhapEbhfhxemzw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_euhapEbhfhxemzw: + + orq %r8,%r8 + je .L_after_reduction_euhapEbhfhxemzw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_euhapEbhfhxemzw: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_14_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_wFmlAewyxkiABzu + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_wFmlAewyxkiABzu + +.L_16_blocks_overflow_wFmlAewyxkiABzu: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_wFmlAewyxkiABzu: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 
0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast 
%zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xleiaowmorzhxfq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xleiaowmorzhxfq +.L_small_initial_partial_block_xleiaowmorzhxfq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq 
$0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xleiaowmorzhxfq: + + orq %r8,%r8 + je .L_after_reduction_xleiaowmorzhxfq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xleiaowmorzhxfq: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_15_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_fwmFnlmCbhngvtq + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_fwmFnlmCbhngvtq + +.L_16_blocks_overflow_fwmFnlmCbhngvtq: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_fwmFnlmCbhngvtq: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 
128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_Cwwewmiesghaixp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_Cwwewmiesghaixp +.L_small_initial_partial_block_Cwwewmiesghaixp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_Cwwewmiesghaixp: + + orq %r8,%r8 + je .L_after_reduction_Cwwewmiesghaixp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_Cwwewmiesghaixp: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_16_CAikcjdGDugFfth: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_xEdGzjmGszadGFy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_xEdGzjmGszadGFy + +.L_16_blocks_overflow_xEdGzjmGszadGFy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_xEdGzjmGszadGFy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 
0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast 
%zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_fphazgGgmEuxiEi: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fphazgGgmEuxiEi: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fphazgGgmEuxiEi: + jmp .L_last_blocks_done_CAikcjdGDugFfth +.L_last_num_blocks_is_0_CAikcjdGDugFfth: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq 
$0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_CAikcjdGDugFfth: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_eawnuBpGmxcBoDC +.L_encrypt_32_blocks_eawnuBpGmxcBoDC: + cmpb $240,%r15b + jae .L_16_blocks_overflow_fxEfrxCahjuywkw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_fxEfrxCahjuywkw +.L_16_blocks_overflow_fxEfrxCahjuywkw: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_fxEfrxCahjuywkw: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + 
vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_iwxfgjgfFyEczhg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_iwxfgjgfFyEczhg +.L_16_blocks_overflow_iwxfgjgfFyEczhg: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_iwxfgjgfFyEczhg: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + 
+ vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + 
vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_muvbsvrgtnhDwuC + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_muvbsvrgtnhDwuC + jb .L_last_num_blocks_is_7_1_muvbsvrgtnhDwuC + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_muvbsvrgtnhDwuC + jb .L_last_num_blocks_is_11_9_muvbsvrgtnhDwuC + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_muvbsvrgtnhDwuC + ja .L_last_num_blocks_is_16_muvbsvrgtnhDwuC + cmpl $14,%r10d + je .L_last_num_blocks_is_14_muvbsvrgtnhDwuC + jmp .L_last_num_blocks_is_13_muvbsvrgtnhDwuC + +.L_last_num_blocks_is_11_9_muvbsvrgtnhDwuC: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_muvbsvrgtnhDwuC + ja .L_last_num_blocks_is_11_muvbsvrgtnhDwuC + jmp .L_last_num_blocks_is_9_muvbsvrgtnhDwuC + +.L_last_num_blocks_is_7_1_muvbsvrgtnhDwuC: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_muvbsvrgtnhDwuC + jb .L_last_num_blocks_is_3_1_muvbsvrgtnhDwuC + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_muvbsvrgtnhDwuC + je .L_last_num_blocks_is_6_muvbsvrgtnhDwuC + jmp .L_last_num_blocks_is_5_muvbsvrgtnhDwuC + +.L_last_num_blocks_is_3_1_muvbsvrgtnhDwuC: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_muvbsvrgtnhDwuC + je .L_last_num_blocks_is_2_muvbsvrgtnhDwuC +.L_last_num_blocks_is_1_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_sCioAEgxkAkBsms + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_sCioAEgxkAkBsms + +.L_16_blocks_overflow_sCioAEgxkAkBsms: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd 
ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_sCioAEgxkAkBsms: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_iuEEnvAblnyuBEp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_iuEEnvAblnyuBEp +.L_small_initial_partial_block_iuEEnvAblnyuBEp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_iuEEnvAblnyuBEp +.L_small_initial_compute_done_iuEEnvAblnyuBEp: +.L_after_reduction_iuEEnvAblnyuBEp: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_2_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_syraAlmuhpzefuz + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_syraAlmuhpzefuz + +.L_16_blocks_overflow_syraAlmuhpzefuz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_syraAlmuhpzefuz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc 
%ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wklxqcsAiCzEeze + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wklxqcsAiCzEeze +.L_small_initial_partial_block_wklxqcsAiCzEeze: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wklxqcsAiCzEeze: + + orq %r8,%r8 + je .L_after_reduction_wklxqcsAiCzEeze + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wklxqcsAiCzEeze: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_3_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_iccrdFDrrokpmyB + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_iccrdFDrrokpmyB + +.L_16_blocks_overflow_iccrdFDrrokpmyB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_iccrdFDrrokpmyB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq 
%zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ohaugBufhhdgdDo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ohaugBufhhdgdDo +.L_small_initial_partial_block_ohaugBufhhdgdDo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq 
$0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ohaugBufhhdgdDo: + + orq %r8,%r8 + je .L_after_reduction_ohaugBufhhdgdDo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ohaugBufhhdgdDo: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_4_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_jkieEplbtgwkEgk + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_jkieEplbtgwkEgk + +.L_16_blocks_overflow_jkieEplbtgwkEgk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_jkieEplbtgwkEgk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 
192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_omkzepGnFhlDsok + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_omkzepGnFhlDsok +.L_small_initial_partial_block_omkzepGnFhlDsok: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_omkzepGnFhlDsok: + + orq %r8,%r8 + je .L_after_reduction_omkzepGnFhlDsok + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_omkzepGnFhlDsok: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_5_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_vtnqanBpwpcCkvb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_vtnqanBpwpcCkvb + +.L_16_blocks_overflow_vtnqanBpwpcCkvb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_vtnqanBpwpcCkvb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + 
vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DiateEzAgclciak + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DiateEzAgclciak +.L_small_initial_partial_block_DiateEzAgclciak: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DiateEzAgclciak: + + orq %r8,%r8 + je .L_after_reduction_DiateEzAgclciak + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DiateEzAgclciak: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_6_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_oakjAwsnClAznod + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_oakjAwsnClAznod + +.L_16_blocks_overflow_oakjAwsnClAznod: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_oakjAwsnClAznod: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + 
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oqCwqiEfmwxEduu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oqCwqiEfmwxEduu +.L_small_initial_partial_block_oqCwqiEfmwxEduu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oqCwqiEfmwxEduu: + + orq %r8,%r8 + je .L_after_reduction_oqCwqiEfmwxEduu + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oqCwqiEfmwxEduu: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_7_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_lhrubptnEwwxvoi + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_lhrubptnEwwxvoi + +.L_16_blocks_overflow_lhrubptnEwwxvoi: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_lhrubptnEwwxvoi: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + 
vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lyGDbaegdAnFgEy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lyGDbaegdAnFgEy +.L_small_initial_partial_block_lyGDbaegdAnFgEy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lyGDbaegdAnFgEy: + + orq %r8,%r8 + je .L_after_reduction_lyGDbaegdAnFgEy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lyGDbaegdAnFgEy: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_8_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_umvkbciEsdgFrgg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_umvkbciEsdgFrgg + +.L_16_blocks_overflow_umvkbciEsdgFrgg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_umvkbciEsdgFrgg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 
160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ogfGBxxhhoalgtB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ogfGBxxhhoalgtB +.L_small_initial_partial_block_ogfGBxxhhoalgtB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ogfGBxxhhoalgtB: + + orq %r8,%r8 + je .L_after_reduction_ogfGBxxhhoalgtB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ogfGBxxhhoalgtB: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_9_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_wFkatvuEtupbkGb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_wFkatvuEtupbkGb + +.L_16_blocks_overflow_wFkatvuEtupbkGb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_wFkatvuEtupbkGb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wkiizpjcpbzfFyj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wkiizpjcpbzfFyj +.L_small_initial_partial_block_wkiizpjcpbzfFyj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wkiizpjcpbzfFyj: + + orq %r8,%r8 + je .L_after_reduction_wkiizpjcpbzfFyj + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wkiizpjcpbzfFyj: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_10_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_ircelvtBaeuiwvC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_ircelvtBaeuiwvC + +.L_16_blocks_overflow_ircelvtBaeuiwvC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_ircelvtBaeuiwvC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + 
vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pDtuuFvFlvjvrCz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pDtuuFvFlvjvrCz +.L_small_initial_partial_block_pDtuuFvFlvjvrCz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq 
%zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pDtuuFvFlvjvrCz: + + orq %r8,%r8 + je .L_after_reduction_pDtuuFvFlvjvrCz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pDtuuFvFlvjvrCz: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_11_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_GozdsctAidzEqxd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_GozdsctAidzEqxd + +.L_16_blocks_overflow_GozdsctAidzEqxd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_GozdsctAidzEqxd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 
+ vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yrocgFvryFBiech + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yrocgFvryFBiech +.L_small_initial_partial_block_yrocgFvryFBiech: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yrocgFvryFBiech: + + orq %r8,%r8 + je .L_after_reduction_yrocgFvryFBiech + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_yrocgFvryFBiech: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_12_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_kgvcyifhjuAglsm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_kgvcyifhjuAglsm + +.L_16_blocks_overflow_kgvcyifhjuAglsm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_kgvcyifhjuAglsm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oclBtelgDoBblti + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq 
$8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oclBtelgDoBblti +.L_small_initial_partial_block_oclBtelgDoBblti: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oclBtelgDoBblti: + + orq %r8,%r8 + je .L_after_reduction_oclBtelgDoBblti + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oclBtelgDoBblti: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_13_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_GgsgulfrbGGFGGc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_GgsgulfrbGGFGGc + +.L_16_blocks_overflow_GgsgulfrbGGFGGc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_GgsgulfrbGGFGGc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq 
%r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bvEBvhpbxzwvDrk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bvEBvhpbxzwvDrk +.L_small_initial_partial_block_bvEBvhpbxzwvDrk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bvEBvhpbxzwvDrk: + + orq %r8,%r8 + je .L_after_reduction_bvEBvhpbxzwvDrk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bvEBvhpbxzwvDrk: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_14_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_vejDBlGzdxbDGDE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_vejDBlGzdxbDGDE + +.L_16_blocks_overflow_vejDBlGzdxbDGDE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_vejDBlGzdxbDGDE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc 
%ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lvCGeChuoEvfnul + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + 
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lvCGeChuoEvfnul +.L_small_initial_partial_block_lvCGeChuoEvfnul: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lvCGeChuoEvfnul: + + orq %r8,%r8 + je .L_after_reduction_lvCGeChuoEvfnul + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lvCGeChuoEvfnul: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_15_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_ytioEdspdkiwstn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_ytioEdspdkiwstn + +.L_16_blocks_overflow_ytioEdspdkiwstn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_ytioEdspdkiwstn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + 
vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + 
cmpq $16,%r8 + jl .L_small_initial_partial_block_fxpoudCxsjlwBmb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fxpoudCxsjlwBmb +.L_small_initial_partial_block_fxpoudCxsjlwBmb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fxpoudCxsjlwBmb: + + orq %r8,%r8 + je .L_after_reduction_fxpoudCxsjlwBmb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fxpoudCxsjlwBmb: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_16_muvbsvrgtnhDwuC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_ijwokgwDeCteCll + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_ijwokgwDeCteCll + +.L_16_blocks_overflow_ijwokgwDeCteCll: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_ijwokgwDeCteCll: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq 
%zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_rCCuFewyfDAEddb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + 
vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rCCuFewyfDAEddb: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rCCuFewyfDAEddb: + jmp .L_last_blocks_done_muvbsvrgtnhDwuC +.L_last_num_blocks_is_0_muvbsvrgtnhDwuC: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_muvbsvrgtnhDwuC: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_eawnuBpGmxcBoDC +.L_encrypt_16_blocks_eawnuBpGmxcBoDC: + cmpb $240,%r15b + jae .L_16_blocks_overflow_nAxplcgfimbFyBh + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_nAxplcgfimbFyBh +.L_16_blocks_overflow_nAxplcgfimbFyBh: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_nAxplcgfimbFyBh: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq 
$0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_gFFyhgntvwxgCvF + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_gFFyhgntvwxgCvF + jb .L_last_num_blocks_is_7_1_gFFyhgntvwxgCvF + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_gFFyhgntvwxgCvF + jb .L_last_num_blocks_is_11_9_gFFyhgntvwxgCvF + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_gFFyhgntvwxgCvF + ja .L_last_num_blocks_is_16_gFFyhgntvwxgCvF + cmpl $14,%r10d + je .L_last_num_blocks_is_14_gFFyhgntvwxgCvF + jmp .L_last_num_blocks_is_13_gFFyhgntvwxgCvF + +.L_last_num_blocks_is_11_9_gFFyhgntvwxgCvF: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_gFFyhgntvwxgCvF + ja .L_last_num_blocks_is_11_gFFyhgntvwxgCvF + jmp .L_last_num_blocks_is_9_gFFyhgntvwxgCvF + +.L_last_num_blocks_is_7_1_gFFyhgntvwxgCvF: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_gFFyhgntvwxgCvF + jb .L_last_num_blocks_is_3_1_gFFyhgntvwxgCvF + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_gFFyhgntvwxgCvF + je .L_last_num_blocks_is_6_gFFyhgntvwxgCvF + jmp .L_last_num_blocks_is_5_gFFyhgntvwxgCvF + +.L_last_num_blocks_is_3_1_gFFyhgntvwxgCvF: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_gFFyhgntvwxgCvF + je .L_last_num_blocks_is_2_gFFyhgntvwxgCvF +.L_last_num_blocks_is_1_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_edqyFiqozsDenuz + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_edqyFiqozsDenuz + +.L_16_blocks_overflow_edqyFiqozsDenuz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_edqyFiqozsDenuz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 
704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hxBDgFwdGwbthCy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hxBDgFwdGwbthCy +.L_small_initial_partial_block_hxBDgFwdGwbthCy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq 
%xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_hxBDgFwdGwbthCy +.L_small_initial_compute_done_hxBDgFwdGwbthCy: +.L_after_reduction_hxBDgFwdGwbthCy: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_2_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_uyuBmtkqzsrxAjG + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_uyuBmtkqzsrxAjG + +.L_16_blocks_overflow_uyuBmtkqzsrxAjG: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_uyuBmtkqzsrxAjG: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + 
vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DnwnjmmqBtjmtxy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DnwnjmmqBtjmtxy +.L_small_initial_partial_block_DnwnjmmqBtjmtxy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DnwnjmmqBtjmtxy: + + orq %r8,%r8 + je .L_after_reduction_DnwnjmmqBtjmtxy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DnwnjmmqBtjmtxy: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_3_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_mayxFbwAyisdwiE + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_mayxFbwAyisdwiE + +.L_16_blocks_overflow_mayxFbwAyisdwiE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_mayxFbwAyisdwiE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + 
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sFnrdciEorxGldB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + 
jmp .L_small_initial_compute_done_sFnrdciEorxGldB +.L_small_initial_partial_block_sFnrdciEorxGldB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_sFnrdciEorxGldB: + + orq %r8,%r8 + je .L_after_reduction_sFnrdciEorxGldB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_sFnrdciEorxGldB: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_4_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_cahBhluzDpDniBC + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_cahBhluzDpDniBC + +.L_16_blocks_overflow_cahBhluzDpDniBC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_cahBhluzDpDniBC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 
144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_flBuFDkGEouCjry + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_flBuFDkGEouCjry +.L_small_initial_partial_block_flBuFDkGEouCjry: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_flBuFDkGEouCjry: + + orq %r8,%r8 + je .L_after_reduction_flBuFDkGEouCjry + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_flBuFDkGEouCjry: + jmp 
.L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_5_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_dogBbFBCkktqmfE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_dogBbFBCkktqmfE + +.L_16_blocks_overflow_dogBbFBCkktqmfE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_dogBbFBCkktqmfE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BcpothbedDEfeoC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BcpothbedDEfeoC +.L_small_initial_partial_block_BcpothbedDEfeoC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BcpothbedDEfeoC: + + orq %r8,%r8 + je .L_after_reduction_BcpothbedDEfeoC + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_BcpothbedDEfeoC: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_6_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_oGartozfntEBpal + vpaddd %zmm28,%zmm2,%zmm0 + 
vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_oGartozfntEBpal + +.L_16_blocks_overflow_oGartozfntEBpal: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_oGartozfntEBpal: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq 
$4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rwznrbbsqxwaCko + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rwznrbbsqxwaCko +.L_small_initial_partial_block_rwznrbbsqxwaCko: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rwznrbbsqxwaCko: + + orq %r8,%r8 + je .L_after_reduction_rwznrbbsqxwaCko + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rwznrbbsqxwaCko: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_7_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae 
.L_16_blocks_overflow_EBiardhujGzcrlk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_EBiardhujGzcrlk + +.L_16_blocks_overflow_EBiardhujGzcrlk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_EBiardhujGzcrlk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tnvletidFAfbEDF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tnvletidFAfbEDF +.L_small_initial_partial_block_tnvletidFAfbEDF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tnvletidFAfbEDF: + + orq %r8,%r8 + je .L_after_reduction_tnvletidFAfbEDF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tnvletidFAfbEDF: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_8_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + 
movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_iumqnFogzhcrGGw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_iumqnFogzhcrGGw + +.L_16_blocks_overflow_iumqnFogzhcrGGw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_iumqnFogzhcrGGw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + 
vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qEzaCAhsCAiFoFG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qEzaCAhsCAiFoFG +.L_small_initial_partial_block_qEzaCAhsCAiFoFG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qEzaCAhsCAiFoFG: + + orq %r8,%r8 + je .L_after_reduction_qEzaCAhsCAiFoFG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_qEzaCAhsCAiFoFG: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF 
+.L_last_num_blocks_is_9_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_uerldGeDtdqniAd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_uerldGeDtdqniAd + +.L_16_blocks_overflow_uerldGeDtdqniAd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_uerldGeDtdqniAd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + 
vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_aaFGCaaBiGmkrxE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_aaFGCaaBiGmkrxE +.L_small_initial_partial_block_aaFGCaaBiGmkrxE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq 
%zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_aaFGCaaBiGmkrxE: + + orq %r8,%r8 + je .L_after_reduction_aaFGCaaBiGmkrxE + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_aaFGCaaBiGmkrxE: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_10_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_Aozpqcpomafvkzu + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_Aozpqcpomafvkzu + +.L_16_blocks_overflow_Aozpqcpomafvkzu: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_Aozpqcpomafvkzu: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dahhcFmAhdipFgB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dahhcFmAhdipFgB +.L_small_initial_partial_block_dahhcFmAhdipFgB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dahhcFmAhdipFgB: + + orq %r8,%r8 + je .L_after_reduction_dahhcFmAhdipFgB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dahhcFmAhdipFgB: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_11_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_EgocqAvvFflyEjg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_EgocqAvvFflyEjg + +.L_16_blocks_overflow_EgocqAvvFflyEjg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_EgocqAvvFflyEjg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb 
%zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BgCerdsyeobnbbs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BgCerdsyeobnbbs +.L_small_initial_partial_block_BgCerdsyeobnbbs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BgCerdsyeobnbbs: + + orq %r8,%r8 + je .L_after_reduction_BgCerdsyeobnbbs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_BgCerdsyeobnbbs: + jmp 
.L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_12_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_cydmoiBEzigfGjF + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_cydmoiBEzigfGjF + +.L_16_blocks_overflow_cydmoiBEzigfGjF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_cydmoiBEzigfGjF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + 
vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_cDdypaAhkmGvFrB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_cDdypaAhkmGvFrB +.L_small_initial_partial_block_cDdypaAhkmGvFrB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + 
vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_cDdypaAhkmGvFrB: + + orq %r8,%r8 + je .L_after_reduction_cDdypaAhkmGvFrB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_cDdypaAhkmGvFrB: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_13_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_cGnAhGixtCoyetC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_cGnAhGixtCoyetC + +.L_16_blocks_overflow_cGnAhGixtCoyetC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_cGnAhGixtCoyetC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq 
$0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FeGcnwBvApiyeqj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq 
$0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FeGcnwBvApiyeqj +.L_small_initial_partial_block_FeGcnwBvApiyeqj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FeGcnwBvApiyeqj: + + orq %r8,%r8 + je .L_after_reduction_FeGcnwBvApiyeqj + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FeGcnwBvApiyeqj: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_14_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae 
.L_16_blocks_overflow_iftBfEFqGGBvyjm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_iftBfEFqGGBvyjm + +.L_16_blocks_overflow_iftBfEFqGGBvyjm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_iftBfEFqGGBvyjm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oihhuqgdwBFgleb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oihhuqgdwBFgleb +.L_small_initial_partial_block_oihhuqgdwBFgleb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oihhuqgdwBFgleb: + + orq %r8,%r8 + je .L_after_reduction_oihhuqgdwBFgleb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oihhuqgdwBFgleb: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_15_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_fvupeAvimjnmGoe + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_fvupeAvimjnmGoe + +.L_16_blocks_overflow_fvupeAvimjnmGoe: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_fvupeAvimjnmGoe: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + 
vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rrptnxnCqernCsp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rrptnxnCqernCsp +.L_small_initial_partial_block_rrptnxnCqernCsp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq 
$0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rrptnxnCqernCsp: + + orq %r8,%r8 + je .L_after_reduction_rrptnxnCqernCsp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rrptnxnCqernCsp: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_16_gFFyhgntvwxgCvF: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_wGkryszirehgiqf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_wGkryszirehgiqf + +.L_16_blocks_overflow_wGkryszirehgiqf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_wGkryszirehgiqf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 
+ vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_ylCxcFDbnxrlyjy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ylCxcFDbnxrlyjy: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ylCxcFDbnxrlyjy: + jmp .L_last_blocks_done_gFFyhgntvwxgCvF +.L_last_num_blocks_is_0_gFFyhgntvwxgCvF: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_gFFyhgntvwxgCvF: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_eawnuBpGmxcBoDC + 
+.L_message_below_32_blocks_eawnuBpGmxcBoDC: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_cyGhsoclCDuqust + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_cyGhsoclCDuqust: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_gmjFjaoGnEhAquD + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_gmjFjaoGnEhAquD + jb .L_last_num_blocks_is_7_1_gmjFjaoGnEhAquD + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_gmjFjaoGnEhAquD + jb .L_last_num_blocks_is_11_9_gmjFjaoGnEhAquD + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_gmjFjaoGnEhAquD + ja .L_last_num_blocks_is_16_gmjFjaoGnEhAquD + cmpl $14,%r10d + je .L_last_num_blocks_is_14_gmjFjaoGnEhAquD + jmp .L_last_num_blocks_is_13_gmjFjaoGnEhAquD + +.L_last_num_blocks_is_11_9_gmjFjaoGnEhAquD: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_gmjFjaoGnEhAquD + ja 
.L_last_num_blocks_is_11_gmjFjaoGnEhAquD + jmp .L_last_num_blocks_is_9_gmjFjaoGnEhAquD + +.L_last_num_blocks_is_7_1_gmjFjaoGnEhAquD: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_gmjFjaoGnEhAquD + jb .L_last_num_blocks_is_3_1_gmjFjaoGnEhAquD + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_gmjFjaoGnEhAquD + je .L_last_num_blocks_is_6_gmjFjaoGnEhAquD + jmp .L_last_num_blocks_is_5_gmjFjaoGnEhAquD + +.L_last_num_blocks_is_3_1_gmjFjaoGnEhAquD: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_gmjFjaoGnEhAquD + je .L_last_num_blocks_is_2_gmjFjaoGnEhAquD +.L_last_num_blocks_is_1_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_lmprlxqohayAaff + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_lmprlxqohayAaff + +.L_16_blocks_overflow_lmprlxqohayAaff: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_lmprlxqohayAaff: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ycnbantiDaoGCva + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ycnbantiDaoGCva +.L_small_initial_partial_block_ycnbantiDaoGCva: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_ycnbantiDaoGCva +.L_small_initial_compute_done_ycnbantiDaoGCva: +.L_after_reduction_ycnbantiDaoGCva: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_2_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_FmnmcFgtBcispji + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_FmnmcFgtBcispji + +.L_16_blocks_overflow_FmnmcFgtBcispji: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_FmnmcFgtBcispji: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + 
vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AtjvciobwAfsBgo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AtjvciobwAfsBgo +.L_small_initial_partial_block_AtjvciobwAfsBgo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AtjvciobwAfsBgo: + 
+ orq %r8,%r8 + je .L_after_reduction_AtjvciobwAfsBgo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AtjvciobwAfsBgo: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_3_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_tgAkxvFFocitubl + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_tgAkxvFFocitubl + +.L_16_blocks_overflow_tgAkxvFFocitubl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_tgAkxvFFocitubl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_siwDojaimuxlcux + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 
+ vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_siwDojaimuxlcux +.L_small_initial_partial_block_siwDojaimuxlcux: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_siwDojaimuxlcux: + + orq %r8,%r8 + je .L_after_reduction_siwDojaimuxlcux + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_siwDojaimuxlcux: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_4_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_AaBBmAybFatffyg + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_AaBBmAybFatffyg + +.L_16_blocks_overflow_AaBBmAybFatffyg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_AaBBmAybFatffyg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq 
$0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xhaBeCiyfAeqaBf + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xhaBeCiyfAeqaBf +.L_small_initial_partial_block_xhaBeCiyfAeqaBf: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xhaBeCiyfAeqaBf: + 
+ orq %r8,%r8 + je .L_after_reduction_xhaBeCiyfAeqaBf + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xhaBeCiyfAeqaBf: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_5_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_akmmkrkgrAtqDyf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_akmmkrkgrAtqDyf + +.L_16_blocks_overflow_akmmkrkgrAtqDyf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_akmmkrkgrAtqDyf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + 
movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xqhfeyAhltlBsyF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xqhfeyAhltlBsyF +.L_small_initial_partial_block_xqhfeyAhltlBsyF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xqhfeyAhltlBsyF: + + orq %r8,%r8 + je .L_after_reduction_xqhfeyAhltlBsyF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xqhfeyAhltlBsyF: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_6_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_vuckCplCqacsnkw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_vuckCplCqacsnkw + +.L_16_blocks_overflow_vuckCplCqacsnkw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_vuckCplCqacsnkw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + 
vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ruAuuqlioaFhuzd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ruAuuqlioaFhuzd +.L_small_initial_partial_block_ruAuuqlioaFhuzd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ruAuuqlioaFhuzd: + + orq %r8,%r8 + je .L_after_reduction_ruAuuqlioaFhuzd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ruAuuqlioaFhuzd: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_7_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_vxwemaBiapgApmr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_vxwemaBiapgApmr + +.L_16_blocks_overflow_vxwemaBiapgApmr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_vxwemaBiapgApmr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wdpAcmnbkmzzufl + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + 
vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wdpAcmnbkmzzufl +.L_small_initial_partial_block_wdpAcmnbkmzzufl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wdpAcmnbkmzzufl: + + orq %r8,%r8 + je .L_after_reduction_wdpAcmnbkmzzufl + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wdpAcmnbkmzzufl: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_8_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_kuexuhgEceqggje + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_kuexuhgEceqggje + +.L_16_blocks_overflow_kuexuhgEceqggje: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_kuexuhgEceqggje: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + 
vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tvzmBcComjdtAzn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tvzmBcComjdtAzn +.L_small_initial_partial_block_tvzmBcComjdtAzn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq 
$0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tvzmBcComjdtAzn: + + orq %r8,%r8 + je .L_after_reduction_tvzmBcComjdtAzn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tvzmBcComjdtAzn: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_9_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_npAFwfijqmcuehu + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_npAFwfijqmcuehu + +.L_16_blocks_overflow_npAFwfijqmcuehu: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_npAFwfijqmcuehu: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + 
vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gxddwsBBhjrmGda + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gxddwsBBhjrmGda +.L_small_initial_partial_block_gxddwsBBhjrmGda: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 
%xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gxddwsBBhjrmGda: + + orq %r8,%r8 + je .L_after_reduction_gxddwsBBhjrmGda + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_gxddwsBBhjrmGda: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_10_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_hvAwbmhkGhGravm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_hvAwbmhkGhGravm + +.L_16_blocks_overflow_hvAwbmhkGhGravm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_hvAwbmhkGhGravm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq 
$0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bjwDcmjtGlgmwEb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bjwDcmjtGlgmwEb +.L_small_initial_partial_block_bjwDcmjtGlgmwEb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bjwDcmjtGlgmwEb: + + orq %r8,%r8 + je .L_after_reduction_bjwDcmjtGlgmwEb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bjwDcmjtGlgmwEb: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_11_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_BhqdCBAEnwmDwhl + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_BhqdCBAEnwmDwhl + +.L_16_blocks_overflow_BhqdCBAEnwmDwhl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_BhqdCBAEnwmDwhl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ipuaxhAChCElalm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq 
$0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ipuaxhAChCElalm +.L_small_initial_partial_block_ipuaxhAChCElalm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ipuaxhAChCElalm: + + orq %r8,%r8 + je .L_after_reduction_ipuaxhAChCElalm + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ipuaxhAChCElalm: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_12_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_ckykbBijvpyDxDm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_ckykbBijvpyDxDm + +.L_16_blocks_overflow_ckykbBijvpyDxDm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_ckykbBijvpyDxDm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 
$3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_mkzFsudzBDhjcvh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mkzFsudzBDhjcvh +.L_small_initial_partial_block_mkzFsudzBDhjcvh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mkzFsudzBDhjcvh: + + orq %r8,%r8 + je .L_after_reduction_mkzFsudzBDhjcvh + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_mkzFsudzBDhjcvh: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_13_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq 
$192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_DjGBFpAkClvxnAD + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_DjGBFpAkClvxnAD + +.L_16_blocks_overflow_DjGBFpAkClvxnAD: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_DjGBFpAkClvxnAD: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lygCkeDknmvaExs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lygCkeDknmvaExs +.L_small_initial_partial_block_lygCkeDknmvaExs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lygCkeDknmvaExs: + + orq %r8,%r8 + je .L_after_reduction_lygCkeDknmvaExs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lygCkeDknmvaExs: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_14_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_mxbEwfimcnwvdax + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_mxbEwfimcnwvdax + +.L_16_blocks_overflow_mxbEwfimcnwvdax: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_mxbEwfimcnwvdax: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + 
vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bdGmCjdgnqqlltq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq 
$0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bdGmCjdgnqqlltq +.L_small_initial_partial_block_bdGmCjdgnqqlltq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bdGmCjdgnqqlltq: + + orq %r8,%r8 + je .L_after_reduction_bdGmCjdgnqqlltq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bdGmCjdgnqqlltq: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_15_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_zgjqhDpFicvrFBk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_zgjqhDpFicvrFBk + +.L_16_blocks_overflow_zgjqhDpFicvrFBk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd 
ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_zgjqhDpFicvrFBk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + 
vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DiAChhgwveonFpA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DiAChhgwveonFpA +.L_small_initial_partial_block_DiAChhgwveonFpA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq 
$0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DiAChhgwveonFpA: + + orq %r8,%r8 + je .L_after_reduction_DiAChhgwveonFpA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DiAChhgwveonFpA: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_16_gmjFjaoGnEhAquD: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_yyltxtltrzdqBtp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_yyltxtltrzdqBtp + +.L_16_blocks_overflow_yyltxtltrzdqBtp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_yyltxtltrzdqBtp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq 
$0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_GsrEfbqkvAdwclh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 
$2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GsrEfbqkvAdwclh: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_GsrEfbqkvAdwclh: + jmp .L_last_blocks_done_gmjFjaoGnEhAquD +.L_last_num_blocks_is_0_gmjFjaoGnEhAquD: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_gmjFjaoGnEhAquD: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_eawnuBpGmxcBoDC + +.L_message_below_equal_16_blocks_eawnuBpGmxcBoDC: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_hbqugjruGfgczBp + jl .L_small_initial_num_blocks_is_7_1_hbqugjruGfgczBp + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_hbqugjruGfgczBp + jl .L_small_initial_num_blocks_is_11_9_hbqugjruGfgczBp + + + cmpq $16,%r12 + je 
.L_small_initial_num_blocks_is_16_hbqugjruGfgczBp + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_hbqugjruGfgczBp + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_hbqugjruGfgczBp + jmp .L_small_initial_num_blocks_is_13_hbqugjruGfgczBp + +.L_small_initial_num_blocks_is_11_9_hbqugjruGfgczBp: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_hbqugjruGfgczBp + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_hbqugjruGfgczBp + jmp .L_small_initial_num_blocks_is_9_hbqugjruGfgczBp + +.L_small_initial_num_blocks_is_7_1_hbqugjruGfgczBp: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_hbqugjruGfgczBp + jl .L_small_initial_num_blocks_is_3_1_hbqugjruGfgczBp + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_hbqugjruGfgczBp + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_hbqugjruGfgczBp + jmp .L_small_initial_num_blocks_is_5_hbqugjruGfgczBp + +.L_small_initial_num_blocks_is_3_1_hbqugjruGfgczBp: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_hbqugjruGfgczBp + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_hbqugjruGfgczBp + + + + + +.L_small_initial_num_blocks_is_1_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_iFmDdgrbxxlznyd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_iFmDdgrbxxlznyd +.L_small_initial_partial_block_iFmDdgrbxxlznyd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_iFmDdgrbxxlznyd +.L_small_initial_compute_done_iFmDdgrbxxlznyd: +.L_after_reduction_iFmDdgrbxxlznyd: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_2_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EsCbfxikCrkamtE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EsCbfxikCrkamtE +.L_small_initial_partial_block_EsCbfxikCrkamtE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EsCbfxikCrkamtE: + + orq %r8,%r8 + je .L_after_reduction_EsCbfxikCrkamtE + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_EsCbfxikCrkamtE: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_3_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tBEoFGBxxBysmml + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tBEoFGBxxBysmml +.L_small_initial_partial_block_tBEoFGBxxBysmml: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + 
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tBEoFGBxxBysmml: + + orq %r8,%r8 + je .L_after_reduction_tBEoFGBxxBysmml + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_tBEoFGBxxBysmml: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_4_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dDrxftiGhnzzsCu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dDrxftiGhnzzsCu 
+.L_small_initial_partial_block_dDrxftiGhnzzsCu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dDrxftiGhnzzsCu: + + orq %r8,%r8 + je .L_after_reduction_dDrxftiGhnzzsCu + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_dDrxftiGhnzzsCu: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_5_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %xmm29,%xmm3,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tgluGdkfFDhsixe + + + + + + subq $16,%r8 + movq $0,(%rdx) + 
vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tgluGdkfFDhsixe +.L_small_initial_partial_block_tgluGdkfFDhsixe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tgluGdkfFDhsixe: + + orq %r8,%r8 + je .L_after_reduction_tgluGdkfFDhsixe + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_tgluGdkfFDhsixe: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_6_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %ymm29,%ymm3,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_cDptiniAjeCvsaA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_cDptiniAjeCvsaA +.L_small_initial_partial_block_cDptiniAjeCvsaA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_cDptiniAjeCvsaA: + + orq %r8,%r8 + je .L_after_reduction_cDptiniAjeCvsaA + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_cDptiniAjeCvsaA: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_7_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CkuomECEjoqBFyr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq 
%ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CkuomECEjoqBFyr +.L_small_initial_partial_block_CkuomECEjoqBFyr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CkuomECEjoqBFyr: + + orq %r8,%r8 + je .L_after_reduction_CkuomECEjoqBFyr + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_CkuomECEjoqBFyr: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_8_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 
+ vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jetFsEuskrjwged + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jetFsEuskrjwged +.L_small_initial_partial_block_jetFsEuskrjwged: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jetFsEuskrjwged: + + orq %r8,%r8 + je .L_after_reduction_jetFsEuskrjwged + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_jetFsEuskrjwged: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_9_hbqugjruGfgczBp: + vmovdqa64 
SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %xmm29,%xmm4,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_djtvlDCcmtClCqd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 
+ + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_djtvlDCcmtClCqd +.L_small_initial_partial_block_djtvlDCcmtClCqd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_djtvlDCcmtClCqd: + + orq %r8,%r8 + je .L_after_reduction_djtvlDCcmtClCqd + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_djtvlDCcmtClCqd: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_10_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 
80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %ymm29,%ymm4,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_aptugwefEgbpisD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_aptugwefEgbpisD +.L_small_initial_partial_block_aptugwefEgbpisD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_aptugwefEgbpisD: + + orq %r8,%r8 + je .L_after_reduction_aptugwefEgbpisD + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_aptugwefEgbpisD: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_11_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc 
%zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BboqcvvuFoyragm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BboqcvvuFoyragm +.L_small_initial_partial_block_BboqcvvuFoyragm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq 
%zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BboqcvvuFoyragm: + + orq %r8,%r8 + je .L_after_reduction_BboqcvvuFoyragm + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_BboqcvvuFoyragm: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_12_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 
$3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yzpAqvxjrjtpbge + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yzpAqvxjrjtpbge +.L_small_initial_partial_block_yzpAqvxjrjtpbge: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yzpAqvxjrjtpbge: + + orq %r8,%r8 + je .L_after_reduction_yzpAqvxjrjtpbge + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_yzpAqvxjrjtpbge: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_13_hbqugjruGfgczBp: + vmovdqa64 
SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %xmm29,%xmm5,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jjkyzlqDAbpoEdw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + 
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jjkyzlqDAbpoEdw +.L_small_initial_partial_block_jjkyzlqDAbpoEdw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jjkyzlqDAbpoEdw: + + orq %r8,%r8 + je .L_after_reduction_jjkyzlqDAbpoEdw + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_jjkyzlqDAbpoEdw: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_14_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %ymm29,%ymm5,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GlbsvkxecbisEEg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GlbsvkxecbisEEg +.L_small_initial_partial_block_GlbsvkxecbisEEg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GlbsvkxecbisEEg: + + orq %r8,%r8 + je 
.L_after_reduction_GlbsvkxecbisEEg + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_GlbsvkxecbisEEg: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_15_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + 
vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BFutaboihmcgqcA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BFutaboihmcgqcA +.L_small_initial_partial_block_BFutaboihmcgqcA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq 
$0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BFutaboihmcgqcA: + + orq %r8,%r8 + je .L_after_reduction_BFutaboihmcgqcA + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_BFutaboihmcgqcA: + jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp +.L_small_initial_num_blocks_is_16_hbqugjruGfgczBp: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 
$3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_AxxoDBglqjscnzw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AxxoDBglqjscnzw: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_AxxoDBglqjscnzw: +.L_small_initial_blocks_encrypted_hbqugjruGfgczBp: +.L_ghash_done_eawnuBpGmxcBoDC: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_eawnuBpGmxcBoDC: + jmp .Lexit_gcm_encrypt +.Lexit_gcm_encrypt: + cmpq $256,%r8 + jbe .Lskip_hkeys_cleanup_FwyhaGceDljchpo + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 %zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_FwyhaGceDljchpo: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.Lencrypt_seh_end: +.cfi_endproc +.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512 
+.globl ossl_aes_gcm_decrypt_avx512 +.type ossl_aes_gcm_decrypt_avx512,@function +.align 32 +ossl_aes_gcm_decrypt_avx512: +.cfi_startproc +.Ldecrypt_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Ldecrypt_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Ldecrypt_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Ldecrypt_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Ldecrypt_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Ldecrypt_seh_push_r14: + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Ldecrypt_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Ldecrypt_seh_setfp: + +.Ldecrypt_seh_prolog_end: + subq $1588,%rsp + andq $(-64),%rsp + + + movl 240(%rdi),%eax + cmpl $9,%eax + je .Laes_gcm_decrypt_128_avx512 + cmpl $11,%eax + je .Laes_gcm_decrypt_192_avx512 + cmpl $13,%eax + je .Laes_gcm_decrypt_256_avx512 + xorl %eax,%eax + jmp .Lexit_gcm_decrypt +.align 32 +.Laes_gcm_decrypt_128_avx512: + orq %r8,%r8 + je .L_enc_dec_done_brADimEeCnCcDmv + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_bsCeAyqpAAwsgvv + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + + vmovdqa64 %xmm0,%xmm6 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_bsCeAyqpAAwsgvv + subq %r13,%r12 +.L_no_extra_mask_bsCeAyqpAAwsgvv: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpand %xmm0,%xmm6,%xmm6 + vpshufb SHUF_MASK(%rip),%xmm6,%xmm6 + vpshufb %xmm5,%xmm6,%xmm6 + vpxorq %xmm6,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_bsCeAyqpAAwsgvv + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_bsCeAyqpAAwsgvv + +.L_partial_incomplete_bsCeAyqpAAwsgvv: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_bsCeAyqpAAwsgvv: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_bsCeAyqpAAwsgvv: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_brADimEeCnCcDmv + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_brADimEeCnCcDmv + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_eghvmbEDtcnDnAu + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 
+ vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_eghvmbEDtcnDnAu +.L_next_16_overflow_eghvmbEDtcnDnAu: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_eghvmbEDtcnDnAu: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_plwezswvdFDdDBp + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_plwezswvdFDdDBp: + cmpq $512,%r8 + jb .L_message_below_32_blocks_brADimEeCnCcDmv + + + + cmpb $240,%r15b + jae .L_next_16_overflow_yieysttglezqCBf + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_yieysttglezqCBf 
+.L_next_16_overflow_yieysttglezqCBf: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_yieysttglezqCBf: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_cqhgcscctsdbGkB + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq 
$0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 
+ + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_cqhgcscctsdbGkB: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_brADimEeCnCcDmv +.L_encrypt_big_nblocks_brADimEeCnCcDmv: + cmpb $240,%r15b + jae .L_16_blocks_overflow_jeuDwtvAfvGmCgt + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_jeuDwtvAfvGmCgt +.L_16_blocks_overflow_jeuDwtvAfvGmCgt: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_jeuDwtvAfvGmCgt: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_hGznvbxlbulnqGf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_hGznvbxlbulnqGf +.L_16_blocks_overflow_hGznvbxlbulnqGf: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_hGznvbxlbulnqGf: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 
+ + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_hikcfykasilniFs + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_hikcfykasilniFs +.L_16_blocks_overflow_hikcfykasilniFs: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_hikcfykasilniFs: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 
%zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_brADimEeCnCcDmv + +.L_no_more_big_nblocks_brADimEeCnCcDmv: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_brADimEeCnCcDmv + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_brADimEeCnCcDmv +.L_encrypt_0_blocks_ghash_32_brADimEeCnCcDmv: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_xyDAiCmaAhzpydl + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_xyDAiCmaAhzpydl + jb .L_last_num_blocks_is_7_1_xyDAiCmaAhzpydl + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_xyDAiCmaAhzpydl + jb .L_last_num_blocks_is_11_9_xyDAiCmaAhzpydl + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_xyDAiCmaAhzpydl + ja .L_last_num_blocks_is_16_xyDAiCmaAhzpydl + cmpl $14,%r10d + je .L_last_num_blocks_is_14_xyDAiCmaAhzpydl + jmp .L_last_num_blocks_is_13_xyDAiCmaAhzpydl + +.L_last_num_blocks_is_11_9_xyDAiCmaAhzpydl: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_xyDAiCmaAhzpydl + ja .L_last_num_blocks_is_11_xyDAiCmaAhzpydl + jmp .L_last_num_blocks_is_9_xyDAiCmaAhzpydl + +.L_last_num_blocks_is_7_1_xyDAiCmaAhzpydl: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_xyDAiCmaAhzpydl + jb .L_last_num_blocks_is_3_1_xyDAiCmaAhzpydl + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_xyDAiCmaAhzpydl + je .L_last_num_blocks_is_6_xyDAiCmaAhzpydl + jmp .L_last_num_blocks_is_5_xyDAiCmaAhzpydl + +.L_last_num_blocks_is_3_1_xyDAiCmaAhzpydl: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_xyDAiCmaAhzpydl + je .L_last_num_blocks_is_2_xyDAiCmaAhzpydl +.L_last_num_blocks_is_1_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_fyDzBrphsGjubgG + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_fyDzBrphsGjubgG + +.L_16_blocks_overflow_fyDzBrphsGjubgG: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_fyDzBrphsGjubgG: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vtxqFwAgrdnllzF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vtxqFwAgrdnllzF +.L_small_initial_partial_block_vtxqFwAgrdnllzF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq 
%ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_vtxqFwAgrdnllzF +.L_small_initial_compute_done_vtxqFwAgrdnllzF: +.L_after_reduction_vtxqFwAgrdnllzF: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_2_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_BugDrclgtxGysBC + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_BugDrclgtxGysBC + +.L_16_blocks_overflow_BugDrclgtxGysBC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_BugDrclgtxGysBC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dwpAvxknFwdDaDi + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + 
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dwpAvxknFwdDaDi +.L_small_initial_partial_block_dwpAvxknFwdDaDi: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dwpAvxknFwdDaDi: + + orq %r8,%r8 + je .L_after_reduction_dwpAvxknFwdDaDi + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dwpAvxknFwdDaDi: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_3_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_xznshBaaivCChih + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_xznshBaaivCChih + +.L_16_blocks_overflow_xznshBaaivCChih: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_xznshBaaivCChih: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ltvboeEneeszwsu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ltvboeEneeszwsu +.L_small_initial_partial_block_ltvboeEneeszwsu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ltvboeEneeszwsu: + + orq %r8,%r8 + je 
.L_after_reduction_ltvboeEneeszwsu + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ltvboeEneeszwsu: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_4_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_ofErewxunpEhuze + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_ofErewxunpEhuze + +.L_16_blocks_overflow_ofErewxunpEhuze: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_ofErewxunpEhuze: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mdwrrkghGswontC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + 
vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mdwrrkghGswontC +.L_small_initial_partial_block_mdwrrkghGswontC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mdwrrkghGswontC: + + orq %r8,%r8 + je .L_after_reduction_mdwrrkghGswontC + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_mdwrrkghGswontC: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_5_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_vlFDjDvkCmipDjj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_vlFDjDvkCmipDjj + +.L_16_blocks_overflow_vlFDjDvkCmipDjj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_vlFDjDvkCmipDjj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vyyfueCAnBpziso + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vyyfueCAnBpziso +.L_small_initial_partial_block_vyyfueCAnBpziso: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 
POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vyyfueCAnBpziso: + + orq %r8,%r8 + je .L_after_reduction_vyyfueCAnBpziso + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vyyfueCAnBpziso: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_6_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_swonEtcpnChuzwe + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_swonEtcpnChuzwe + +.L_16_blocks_overflow_swonEtcpnChuzwe: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_swonEtcpnChuzwe: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 
+ vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_aEryhnaxCjcvalc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_aEryhnaxCjcvalc +.L_small_initial_partial_block_aEryhnaxCjcvalc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_aEryhnaxCjcvalc: + + orq %r8,%r8 + je .L_after_reduction_aEryhnaxCjcvalc + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_aEryhnaxCjcvalc: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_7_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_EGhejzspzceoDrz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_EGhejzspzceoDrz + +.L_16_blocks_overflow_EGhejzspzceoDrz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb 
%zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_EGhejzspzceoDrz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lcrbhrsFEemAseF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lcrbhrsFEemAseF +.L_small_initial_partial_block_lcrbhrsFEemAseF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lcrbhrsFEemAseF: + + orq %r8,%r8 + je .L_after_reduction_lcrbhrsFEemAseF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lcrbhrsFEemAseF: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_8_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_bwyfeoBaojvbAgd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_bwyfeoBaojvbAgd + +.L_16_blocks_overflow_bwyfeoBaojvbAgd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_bwyfeoBaojvbAgd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 
192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_osycqepyfDlatEs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_osycqepyfDlatEs +.L_small_initial_partial_block_osycqepyfDlatEs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_osycqepyfDlatEs: + + orq %r8,%r8 + je .L_after_reduction_osycqepyfDlatEs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_osycqepyfDlatEs: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_9_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_BaoGkpEpCdeyrev + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_BaoGkpEpCdeyrev + +.L_16_blocks_overflow_BaoGkpEpCdeyrev: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_BaoGkpEpCdeyrev: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 
96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ilsvshcinsdmttt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ilsvshcinsdmttt +.L_small_initial_partial_block_ilsvshcinsdmttt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq 
$0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ilsvshcinsdmttt: + + orq %r8,%r8 + je .L_after_reduction_ilsvshcinsdmttt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ilsvshcinsdmttt: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_10_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_sAtxBaaxwaffire + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_sAtxBaaxwaffire + +.L_16_blocks_overflow_sAtxBaaxwaffire: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_sAtxBaaxwaffire: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + 
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mAgwqklangGkxiD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mAgwqklangGkxiD +.L_small_initial_partial_block_mAgwqklangGkxiD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mAgwqklangGkxiD: + + orq %r8,%r8 + je .L_after_reduction_mAgwqklangGkxiD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_mAgwqklangGkxiD: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_11_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_ditvbyzmFxiaFex + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_ditvbyzmFxiaFex + +.L_16_blocks_overflow_ditvbyzmFxiaFex: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_ditvbyzmFxiaFex: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hnpDdEkCCcoeFCy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hnpDdEkCCcoeFCy +.L_small_initial_partial_block_hnpDdEkCCcoeFCy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + 
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hnpDdEkCCcoeFCy: + + orq %r8,%r8 + je .L_after_reduction_hnpDdEkCCcoeFCy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hnpDdEkCCcoeFCy: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_12_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_iDaEpwpdhbvwFws + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_iDaEpwpdhbvwFws + +.L_16_blocks_overflow_iDaEpwpdhbvwFws: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_iDaEpwpdhbvwFws: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + 
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vFCCocfxfdGyktw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vFCCocfxfdGyktw +.L_small_initial_partial_block_vFCCocfxfdGyktw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + 
vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vFCCocfxfdGyktw: + + orq %r8,%r8 + je .L_after_reduction_vFCCocfxfdGyktw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vFCCocfxfdGyktw: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_13_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_ossjtlatrhiigng + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_ossjtlatrhiigng + +.L_16_blocks_overflow_ossjtlatrhiigng: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_ossjtlatrhiigng: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CiuBkutmcuwgEdD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CiuBkutmcuwgEdD +.L_small_initial_partial_block_CiuBkutmcuwgEdD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CiuBkutmcuwgEdD: + + orq %r8,%r8 + je .L_after_reduction_CiuBkutmcuwgEdD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CiuBkutmcuwgEdD: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_14_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_vocABmmphunBotn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_vocABmmphunBotn + +.L_16_blocks_overflow_vocABmmphunBotn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_vocABmmphunBotn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + 
+ vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xoGwditlthtdCzd + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xoGwditlthtdCzd +.L_small_initial_partial_block_xoGwditlthtdCzd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xoGwditlthtdCzd: + + orq %r8,%r8 + je .L_after_reduction_xoGwditlthtdCzd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xoGwditlthtdCzd: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_15_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_jbcAwazvdrBjhzu + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_jbcAwazvdrBjhzu + +.L_16_blocks_overflow_jbcAwazvdrBjhzu: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_jbcAwazvdrBjhzu: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_eohjglCqsfjlesq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_eohjglCqsfjlesq +.L_small_initial_partial_block_eohjglCqsfjlesq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_eohjglCqsfjlesq: + + orq %r8,%r8 + je .L_after_reduction_eohjglCqsfjlesq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_eohjglCqsfjlesq: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_16_xyDAiCmaAhzpydl: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_uatdhlpChpnBofk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_uatdhlpChpnBofk + +.L_16_blocks_overflow_uatdhlpChpnBofk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_uatdhlpChpnBofk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_uvEqevkuejAoeFv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + 
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uvEqevkuejAoeFv: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uvEqevkuejAoeFv: + jmp .L_last_blocks_done_xyDAiCmaAhzpydl +.L_last_num_blocks_is_0_xyDAiCmaAhzpydl: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_xyDAiCmaAhzpydl: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_brADimEeCnCcDmv +.L_encrypt_32_blocks_brADimEeCnCcDmv: + cmpb $240,%r15b + jae .L_16_blocks_overflow_brlCzGBjhaqyEcd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_brlCzGBjhaqyEcd +.L_16_blocks_overflow_brlCzGBjhaqyEcd: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_brlCzGBjhaqyEcd: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 
832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_pchieDggcEipdhz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp 
.L_16_blocks_ok_pchieDggcEipdhz +.L_16_blocks_overflow_pchieDggcEipdhz: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_pchieDggcEipdhz: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq 
%zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_digsBljoDvGeopi + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_digsBljoDvGeopi + jb .L_last_num_blocks_is_7_1_digsBljoDvGeopi + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_digsBljoDvGeopi + jb .L_last_num_blocks_is_11_9_digsBljoDvGeopi + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_digsBljoDvGeopi + ja .L_last_num_blocks_is_16_digsBljoDvGeopi + cmpl $14,%r10d + je .L_last_num_blocks_is_14_digsBljoDvGeopi + jmp .L_last_num_blocks_is_13_digsBljoDvGeopi + +.L_last_num_blocks_is_11_9_digsBljoDvGeopi: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_digsBljoDvGeopi + ja .L_last_num_blocks_is_11_digsBljoDvGeopi + jmp .L_last_num_blocks_is_9_digsBljoDvGeopi + +.L_last_num_blocks_is_7_1_digsBljoDvGeopi: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_digsBljoDvGeopi + jb .L_last_num_blocks_is_3_1_digsBljoDvGeopi + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_digsBljoDvGeopi + je .L_last_num_blocks_is_6_digsBljoDvGeopi + jmp .L_last_num_blocks_is_5_digsBljoDvGeopi + +.L_last_num_blocks_is_3_1_digsBljoDvGeopi: + + cmpl $2,%r10d + ja 
.L_last_num_blocks_is_3_digsBljoDvGeopi + je .L_last_num_blocks_is_2_digsBljoDvGeopi +.L_last_num_blocks_is_1_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_eopubcfobBxhpzt + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_eopubcfobBxhpzt + +.L_16_blocks_overflow_eopubcfobBxhpzt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_eopubcfobBxhpzt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GethbnvGqcjphdB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + 
vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GethbnvGqcjphdB +.L_small_initial_partial_block_GethbnvGqcjphdB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_GethbnvGqcjphdB +.L_small_initial_compute_done_GethbnvGqcjphdB: +.L_after_reduction_GethbnvGqcjphdB: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_2_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_tpsnzcptGBjneak + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_tpsnzcptGBjneak + +.L_16_blocks_overflow_tpsnzcptGBjneak: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_tpsnzcptGBjneak: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 
144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xzAlvFvGbtFmqjz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xzAlvFvGbtFmqjz +.L_small_initial_partial_block_xzAlvFvGbtFmqjz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xzAlvFvGbtFmqjz: + + orq %r8,%r8 + je .L_after_reduction_xzAlvFvGbtFmqjz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xzAlvFvGbtFmqjz: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_3_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_lirgnnkvzmitoxw + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_lirgnnkvzmitoxw + +.L_16_blocks_overflow_lirgnnkvzmitoxw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_lirgnnkvzmitoxw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ovClAwtFzFgwrxE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ovClAwtFzFgwrxE +.L_small_initial_partial_block_ovClAwtFzFgwrxE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq 
%ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ovClAwtFzFgwrxE: + + orq %r8,%r8 + je .L_after_reduction_ovClAwtFzFgwrxE + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ovClAwtFzFgwrxE: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_4_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_xgCtemAejdionch + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_xgCtemAejdionch + +.L_16_blocks_overflow_xgCtemAejdionch: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_xgCtemAejdionch: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_iEyBjAGEhdmCFpz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + 
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_iEyBjAGEhdmCFpz +.L_small_initial_partial_block_iEyBjAGEhdmCFpz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_iEyBjAGEhdmCFpz: + + orq %r8,%r8 + je .L_after_reduction_iEyBjAGEhdmCFpz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_iEyBjAGEhdmCFpz: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_5_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_eojywxfxbxGnElA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_eojywxfxbxGnElA + +.L_16_blocks_overflow_eojywxfxbxGnElA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_eojywxfxbxGnElA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + 
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xzyrfzavvdvxobt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xzyrfzavvdvxobt +.L_small_initial_partial_block_xzyrfzavvdvxobt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xzyrfzavvdvxobt: + + orq %r8,%r8 + je .L_after_reduction_xzyrfzavvdvxobt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xzyrfzavvdvxobt: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_6_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_fefwvFrCitcygrh + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_fefwvFrCitcygrh + +.L_16_blocks_overflow_fefwvFrCitcygrh: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_fefwvFrCitcygrh: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq 
%zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EGwsgDahgpEisFa + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EGwsgDahgpEisFa +.L_small_initial_partial_block_EGwsgDahgpEisFa: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EGwsgDahgpEisFa: + + orq %r8,%r8 + je .L_after_reduction_EGwsgDahgpEisFa + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_EGwsgDahgpEisFa: + jmp 
.L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_7_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_GiAftkxuDrwByoy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_GiAftkxuDrwByoy + +.L_16_blocks_overflow_GiAftkxuDrwByoy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_GiAftkxuDrwByoy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pvtnwvrCesGFzzt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + 
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pvtnwvrCesGFzzt +.L_small_initial_partial_block_pvtnwvrCesGFzzt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pvtnwvrCesGFzzt: + + orq %r8,%r8 + je .L_after_reduction_pvtnwvrCesGFzzt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pvtnwvrCesGFzzt: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_8_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_fdotfBFcguDtbBo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_fdotfBFcguDtbBo + +.L_16_blocks_overflow_fdotfBFcguDtbBo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_fdotfBFcguDtbBo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 
32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wvodhAGehoxjCmp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wvodhAGehoxjCmp +.L_small_initial_partial_block_wvodhAGehoxjCmp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wvodhAGehoxjCmp: + + orq %r8,%r8 + je .L_after_reduction_wvodhAGehoxjCmp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wvodhAGehoxjCmp: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_9_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_GcksGDvymbkGaeh + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_GcksGDvymbkGaeh + +.L_16_blocks_overflow_GcksGDvymbkGaeh: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_GcksGDvymbkGaeh: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uqlihfyhxyhihvk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + 
vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uqlihfyhxyhihvk +.L_small_initial_partial_block_uqlihfyhxyhihvk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uqlihfyhxyhihvk: + + orq %r8,%r8 + je .L_after_reduction_uqlihfyhxyhihvk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uqlihfyhxyhihvk: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_10_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_bjDavzoezpzksBl + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_bjDavzoezpzksBl + +.L_16_blocks_overflow_bjDavzoezpzksBl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_bjDavzoezpzksBl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_thhwkdBkbzuszkb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_thhwkdBkbzuszkb 
+.L_small_initial_partial_block_thhwkdBkbzuszkb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_thhwkdBkbzuszkb: + + orq %r8,%r8 + je .L_after_reduction_thhwkdBkbzuszkb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_thhwkdBkbzuszkb: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_11_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_epoBmnewvcDxoga + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_epoBmnewvcDxoga + +.L_16_blocks_overflow_epoBmnewvcDxoga: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_epoBmnewvcDxoga: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xCrDaEDvhzCAvdw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xCrDaEDvhzCAvdw +.L_small_initial_partial_block_xCrDaEDvhzCAvdw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xCrDaEDvhzCAvdw: + + orq %r8,%r8 + je .L_after_reduction_xCrDaEDvhzCAvdw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xCrDaEDvhzCAvdw: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_12_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_jDebikuAmaaarvn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_jDebikuAmaaarvn + +.L_16_blocks_overflow_jDebikuAmaaarvn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_jDebikuAmaaarvn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + 
vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ynohxakFGzjuDGi + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ynohxakFGzjuDGi +.L_small_initial_partial_block_ynohxakFGzjuDGi: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ynohxakFGzjuDGi: + + orq %r8,%r8 + je .L_after_reduction_ynohxakFGzjuDGi + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ynohxakFGzjuDGi: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_13_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_hshekyDxCginrlC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_hshekyDxCginrlC + +.L_16_blocks_overflow_hshekyDxCginrlC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_hshekyDxCginrlC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_httDwjAaGCslaiE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq 
$0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_httDwjAaGCslaiE +.L_small_initial_partial_block_httDwjAaGCslaiE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_httDwjAaGCslaiE: + + orq %r8,%r8 + je .L_after_reduction_httDwjAaGCslaiE + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_httDwjAaGCslaiE: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_14_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_DrtmyDmpgCneBsy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_DrtmyDmpgCneBsy + +.L_16_blocks_overflow_DrtmyDmpgCneBsy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + 
vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_DrtmyDmpgCneBsy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 
%ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fAmeqrcqmahfygz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fAmeqrcqmahfygz +.L_small_initial_partial_block_fAmeqrcqmahfygz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fAmeqrcqmahfygz: + + orq %r8,%r8 + je .L_after_reduction_fAmeqrcqmahfygz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fAmeqrcqmahfygz: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_15_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_jakbeEuDkermeem + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_jakbeEuDkermeem + +.L_16_blocks_overflow_jakbeEuDkermeem: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_jakbeEuDkermeem: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_czuljoFmwduytgq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_czuljoFmwduytgq +.L_small_initial_partial_block_czuljoFmwduytgq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_czuljoFmwduytgq: + + orq %r8,%r8 + je .L_after_reduction_czuljoFmwduytgq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_czuljoFmwduytgq: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_16_digsBljoDvGeopi: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_pFvBGotBaidmClB + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_pFvBGotBaidmClB + +.L_16_blocks_overflow_pFvBGotBaidmClB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_pFvBGotBaidmClB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_rlrrckDhqtmvgrG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq 
$0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rlrrckDhqtmvgrG: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rlrrckDhqtmvgrG: + jmp .L_last_blocks_done_digsBljoDvGeopi +.L_last_num_blocks_is_0_digsBljoDvGeopi: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_digsBljoDvGeopi: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_brADimEeCnCcDmv +.L_encrypt_16_blocks_brADimEeCnCcDmv: + cmpb $240,%r15b + jae .L_16_blocks_overflow_mBiujfnyqjDacBo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_mBiujfnyqjDacBo +.L_16_blocks_overflow_mBiujfnyqjDacBo: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb 
%zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_mBiujfnyqjDacBo: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb 
%zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_hpinkedxAsgwrDG + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_hpinkedxAsgwrDG + jb .L_last_num_blocks_is_7_1_hpinkedxAsgwrDG + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_hpinkedxAsgwrDG + jb .L_last_num_blocks_is_11_9_hpinkedxAsgwrDG + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_hpinkedxAsgwrDG + ja .L_last_num_blocks_is_16_hpinkedxAsgwrDG + cmpl $14,%r10d + je .L_last_num_blocks_is_14_hpinkedxAsgwrDG + jmp .L_last_num_blocks_is_13_hpinkedxAsgwrDG + +.L_last_num_blocks_is_11_9_hpinkedxAsgwrDG: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_hpinkedxAsgwrDG + ja .L_last_num_blocks_is_11_hpinkedxAsgwrDG + jmp .L_last_num_blocks_is_9_hpinkedxAsgwrDG + +.L_last_num_blocks_is_7_1_hpinkedxAsgwrDG: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_hpinkedxAsgwrDG + jb .L_last_num_blocks_is_3_1_hpinkedxAsgwrDG + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_hpinkedxAsgwrDG + je .L_last_num_blocks_is_6_hpinkedxAsgwrDG + jmp .L_last_num_blocks_is_5_hpinkedxAsgwrDG + +.L_last_num_blocks_is_3_1_hpinkedxAsgwrDG: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_hpinkedxAsgwrDG + je .L_last_num_blocks_is_2_hpinkedxAsgwrDG +.L_last_num_blocks_is_1_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_fBBmqqamxsbkcrt + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_fBBmqqamxsbkcrt + +.L_16_blocks_overflow_fBBmqqamxsbkcrt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_fBBmqqamxsbkcrt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + 
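+# Note: with only one counter block left, the AES rounds below run on %xmm0, while the zmm-wide
+# vpclmulqdq/vpternlogq sequence appears to fold the data blocks stashed on the stack during the
+# previous 16-block pass into the running GHASH sums kept in %zmm24, %zmm25 and %zmm26.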
vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lrfgmFpfobGvwfj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lrfgmFpfobGvwfj +.L_small_initial_partial_block_lrfgmFpfobGvwfj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp 
.L_after_reduction_lrfgmFpfobGvwfj +.L_small_initial_compute_done_lrfgmFpfobGvwfj: +.L_after_reduction_lrfgmFpfobGvwfj: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_2_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_xDanrAoaAcACiFw + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_xDanrAoaAcACiFw + +.L_16_blocks_overflow_xDanrAoaAcACiFw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_xDanrAoaAcACiFw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_rgsstcnEqnxrxBs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rgsstcnEqnxrxBs +.L_small_initial_partial_block_rgsstcnEqnxrxBs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rgsstcnEqnxrxBs: + + orq %r8,%r8 + je .L_after_reduction_rgsstcnEqnxrxBs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rgsstcnEqnxrxBs: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_3_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_lrqqcheobutysur + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_lrqqcheobutysur + +.L_16_blocks_overflow_lrqqcheobutysur: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_lrqqcheobutysur: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xejmrnqBpubjbjg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xejmrnqBpubjbjg +.L_small_initial_partial_block_xejmrnqBpubjbjg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + 
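+# The extract/xor ladder that follows folds the 512-bit high (%zmm0) and low (%zmm3) GHASH halves
+# down to a single 128-bit pair in %xmm0/%xmm3 before the reduction against the POLY2 constant.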
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xejmrnqBpubjbjg: + + orq %r8,%r8 + je .L_after_reduction_xejmrnqBpubjbjg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xejmrnqBpubjbjg: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_4_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_gjemvxDziwfmcyi + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_gjemvxDziwfmcyi + +.L_16_blocks_overflow_gjemvxDziwfmcyi: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_gjemvxDziwfmcyi: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq 
$0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fCcphAbbvbdCpEo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fCcphAbbvbdCpEo +.L_small_initial_partial_block_fCcphAbbvbdCpEo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fCcphAbbvbdCpEo: + + orq %r8,%r8 + je .L_after_reduction_fCcphAbbvbdCpEo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fCcphAbbvbdCpEo: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_5_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_ftkjlfgrvFmBAqj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_ftkjlfgrvFmBAqj + +.L_16_blocks_overflow_ftkjlfgrvFmBAqj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_ftkjlfgrvFmBAqj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
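+# Above, %k1 was loaded from byte64_len_to_mask_table for the final partial block, and the
+# cmpl $251 / jae test appears to take the byte-reflected slow path only when adding 5 to the low
+# counter byte would carry; the five counter blocks now sit in %zmm0 (4 blocks) and %xmm3 (1 block).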
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GcmEpgzDnksqGvv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq 
$0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GcmEpgzDnksqGvv +.L_small_initial_partial_block_GcmEpgzDnksqGvv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GcmEpgzDnksqGvv: + + orq %r8,%r8 + je .L_after_reduction_GcmEpgzDnksqGvv + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_GcmEpgzDnksqGvv: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_6_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_wcFtAwbEGtnhhov + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_wcFtAwbEGtnhhov + +.L_16_blocks_overflow_wcFtAwbEGtnhhov: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_wcFtAwbEGtnhhov: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ljhumqErtfjivdq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ljhumqErtfjivdq +.L_small_initial_partial_block_ljhumqErtfjivdq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ljhumqErtfjivdq: + + orq %r8,%r8 + je .L_after_reduction_ljhumqErtfjivdq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ljhumqErtfjivdq: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_7_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_xipoAqDkcCyBFhx + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_xipoAqDkcCyBFhx + +.L_16_blocks_overflow_xipoAqDkcCyBFhx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_xipoAqDkcCyBFhx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jeohFFoGiiGxanC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + 
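+# vpternlogq with imm8 0x96 is a three-input XOR; the instruction below xors the folded high half
+# (%xmm0) and the two shifted reduction products (%xmm5, %xmm14) into the updated GHASH value in %xmm14.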
vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jeohFFoGiiGxanC +.L_small_initial_partial_block_jeohFFoGiiGxanC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jeohFFoGiiGxanC: + + orq %r8,%r8 + je .L_after_reduction_jeohFFoGiiGxanC + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jeohFFoGiiGxanC: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_8_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_CxhquljwEiGywcd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_CxhquljwEiGywcd + +.L_16_blocks_overflow_CxhquljwEiGywcd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_CxhquljwEiGywcd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_eqywyFyndjkBDnx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_eqywyFyndjkBDnx +.L_small_initial_partial_block_eqywyFyndjkBDnx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 
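+# Each 128-bit GHASH multiply is assembled from four vpclmulqdq results: imm8 0x00 and 0x11 select the
+# low*low and high*high 64-bit products, while 0x01 and 0x10 produce the two cross terms for the middle.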
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_eqywyFyndjkBDnx: + + orq %r8,%r8 + je .L_after_reduction_eqywyFyndjkBDnx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_eqywyFyndjkBDnx: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_9_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_tqfxslkwuCurEnc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_tqfxslkwuCurEnc + +.L_16_blocks_overflow_tqfxslkwuCurEnc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_tqfxslkwuCurEnc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 
96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pxwcCmexoxpnkgA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pxwcCmexoxpnkgA +.L_small_initial_partial_block_pxwcCmexoxpnkgA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pxwcCmexoxpnkgA: + + orq %r8,%r8 + je .L_after_reduction_pxwcCmexoxpnkgA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pxwcCmexoxpnkgA: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_10_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_tiwCrijFxfsopuz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_tiwCrijFxfsopuz + +.L_16_blocks_overflow_tiwCrijFxfsopuz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_tiwCrijFxfsopuz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + 
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rjgbwiCDGnxhaGp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq 
%zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rjgbwiCDGnxhaGp +.L_small_initial_partial_block_rjgbwiCDGnxhaGp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rjgbwiCDGnxhaGp: + + orq %r8,%r8 + je .L_after_reduction_rjgbwiCDGnxhaGp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rjgbwiCDGnxhaGp: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_11_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_wphxdqsnBGrxkBa + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_wphxdqsnBGrxkBa + +.L_16_blocks_overflow_wphxdqsnBGrxkBa: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_wphxdqsnBGrxkBa: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_DAeDyvlteBcjnnm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DAeDyvlteBcjnnm +.L_small_initial_partial_block_DAeDyvlteBcjnnm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DAeDyvlteBcjnnm: + + orq %r8,%r8 + je .L_after_reduction_DAeDyvlteBcjnnm + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DAeDyvlteBcjnnm: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_12_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq 
$128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_btzqkvdAeDABvcj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_btzqkvdAeDABvcj + +.L_16_blocks_overflow_btzqkvdAeDABvcj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_btzqkvdAeDABvcj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq 
%xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BAFapfuAGyFkstm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BAFapfuAGyFkstm +.L_small_initial_partial_block_BAFapfuAGyFkstm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 
$1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BAFapfuAGyFkstm: + + orq %r8,%r8 + je .L_after_reduction_BAFapfuAGyFkstm + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_BAFapfuAGyFkstm: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_13_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_eqBacrjkweGnBBv + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_eqBacrjkweGnBBv + +.L_16_blocks_overflow_eqBacrjkweGnBBv: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_eqBacrjkweGnBBv: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + 
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zzCAagwwuuueoBh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zzCAagwwuuueoBh +.L_small_initial_partial_block_zzCAagwwuuueoBh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zzCAagwwuuueoBh: + + orq %r8,%r8 + je .L_after_reduction_zzCAagwwuuueoBh + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zzCAagwwuuueoBh: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_14_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_hBvbhuzsjeqFuma + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_hBvbhuzsjeqFuma + +.L_16_blocks_overflow_hBvbhuzsjeqFuma: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_hBvbhuzsjeqFuma: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + 
vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mwionbCzEjjlanp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mwionbCzEjjlanp +.L_small_initial_partial_block_mwionbCzEjjlanp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + 
+ + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mwionbCzEjjlanp: + + orq %r8,%r8 + je .L_after_reduction_mwionbCzEjjlanp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_mwionbCzEjjlanp: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_15_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_BDaqedvcvzqmjwo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_BDaqedvcvzqmjwo + +.L_16_blocks_overflow_BDaqedvcvzqmjwo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_BDaqedvcvzqmjwo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq 
$0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EFDnDGjBfhFbjps + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + 
vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EFDnDGjBfhFbjps +.L_small_initial_partial_block_EFDnDGjBfhFbjps: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EFDnDGjBfhFbjps: + + orq %r8,%r8 + je .L_after_reduction_EFDnDGjBfhFbjps + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_EFDnDGjBfhFbjps: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_16_hpinkedxAsgwrDG: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_etaGdjDbzcppuhm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_etaGdjDbzcppuhm + +.L_16_blocks_overflow_etaGdjDbzcppuhm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_etaGdjDbzcppuhm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + 
vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + 
vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_zcehcCvffqhlrEC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zcehcCvffqhlrEC: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zcehcCvffqhlrEC: + jmp .L_last_blocks_done_hpinkedxAsgwrDG +.L_last_num_blocks_is_0_hpinkedxAsgwrDG: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + 
vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_hpinkedxAsgwrDG: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_brADimEeCnCcDmv + +.L_message_below_32_blocks_brADimEeCnCcDmv: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_hlnFoocmixcFBsB + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_hlnFoocmixcFBsB: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_ytkmwztBxmufdeg + + cmpl $8,%r10d + je 
.L_last_num_blocks_is_8_ytkmwztBxmufdeg + jb .L_last_num_blocks_is_7_1_ytkmwztBxmufdeg + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_ytkmwztBxmufdeg + jb .L_last_num_blocks_is_11_9_ytkmwztBxmufdeg + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_ytkmwztBxmufdeg + ja .L_last_num_blocks_is_16_ytkmwztBxmufdeg + cmpl $14,%r10d + je .L_last_num_blocks_is_14_ytkmwztBxmufdeg + jmp .L_last_num_blocks_is_13_ytkmwztBxmufdeg + +.L_last_num_blocks_is_11_9_ytkmwztBxmufdeg: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_ytkmwztBxmufdeg + ja .L_last_num_blocks_is_11_ytkmwztBxmufdeg + jmp .L_last_num_blocks_is_9_ytkmwztBxmufdeg + +.L_last_num_blocks_is_7_1_ytkmwztBxmufdeg: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_ytkmwztBxmufdeg + jb .L_last_num_blocks_is_3_1_ytkmwztBxmufdeg + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_ytkmwztBxmufdeg + je .L_last_num_blocks_is_6_ytkmwztBxmufdeg + jmp .L_last_num_blocks_is_5_ytkmwztBxmufdeg + +.L_last_num_blocks_is_3_1_ytkmwztBxmufdeg: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_ytkmwztBxmufdeg + je .L_last_num_blocks_is_2_ytkmwztBxmufdeg +.L_last_num_blocks_is_1_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_bGwqvrBoAiaAwkr + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_bGwqvrBoAiaAwkr + +.L_16_blocks_overflow_bGwqvrBoAiaAwkr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_bGwqvrBoAiaAwkr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + 
vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dqohylvpeBErAsj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dqohylvpeBErAsj +.L_small_initial_partial_block_dqohylvpeBErAsj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_dqohylvpeBErAsj +.L_small_initial_compute_done_dqohylvpeBErAsj: +.L_after_reduction_dqohylvpeBErAsj: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_2_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_lsDChrkFfFrGvvk + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_lsDChrkFfFrGvvk + +.L_16_blocks_overflow_lsDChrkFfFrGvvk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_lsDChrkFfFrGvvk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc 
%ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_Bgmdyvgptvfwdit + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_Bgmdyvgptvfwdit +.L_small_initial_partial_block_Bgmdyvgptvfwdit: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_Bgmdyvgptvfwdit: + + orq %r8,%r8 + je .L_after_reduction_Bgmdyvgptvfwdit + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_Bgmdyvgptvfwdit: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_3_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_srEocbwAwxsxpma + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_srEocbwAwxsxpma + +.L_16_blocks_overflow_srEocbwAwxsxpma: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_srEocbwAwxsxpma: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ErkzfxFAbndCAAg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 
+ vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ErkzfxFAbndCAAg +.L_small_initial_partial_block_ErkzfxFAbndCAAg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ErkzfxFAbndCAAg: + + orq %r8,%r8 + je .L_after_reduction_ErkzfxFAbndCAAg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ErkzfxFAbndCAAg: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_4_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_wbyjFiCBFhEhwdm + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_wbyjFiCBFhEhwdm + +.L_16_blocks_overflow_wbyjFiCBFhEhwdm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_wbyjFiCBFhEhwdm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + 
vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sEeExElgbeebmrl + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_sEeExElgbeebmrl +.L_small_initial_partial_block_sEeExElgbeebmrl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_sEeExElgbeebmrl: + + orq %r8,%r8 + je .L_after_reduction_sEeExElgbeebmrl + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_sEeExElgbeebmrl: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_5_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_FhnyaskgxleEyeh + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_FhnyaskgxleEyeh + 
+.L_16_blocks_overflow_FhnyaskgxleEyeh: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_FhnyaskgxleEyeh: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wcgcyCwrColDBul + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq 
$0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wcgcyCwrColDBul +.L_small_initial_partial_block_wcgcyCwrColDBul: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wcgcyCwrColDBul: + + orq %r8,%r8 + je .L_after_reduction_wcgcyCwrColDBul + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wcgcyCwrColDBul: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_6_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_EfyidiDbmAaAaju + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_EfyidiDbmAaAaju + +.L_16_blocks_overflow_EfyidiDbmAaAaju: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_EfyidiDbmAaAaju: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + 
vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jGjykEdEyDattqe + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jGjykEdEyDattqe +.L_small_initial_partial_block_jGjykEdEyDattqe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jGjykEdEyDattqe: + + orq %r8,%r8 + je .L_after_reduction_jGjykEdEyDattqe + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jGjykEdEyDattqe: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_7_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_mzDdvEgkDwBlewp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_mzDdvEgkDwBlewp + +.L_16_blocks_overflow_mzDdvEgkDwBlewp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_mzDdvEgkDwBlewp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq 
%zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zwgGbbACgGfeFja + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zwgGbbACgGfeFja +.L_small_initial_partial_block_zwgGbbACgGfeFja: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zwgGbbACgGfeFja: + + orq %r8,%r8 + je .L_after_reduction_zwgGbbACgGfeFja + vpxorq %xmm7,%xmm14,%xmm14 
+.L_after_reduction_zwgGbbACgGfeFja: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_8_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_jqmGdhzdkozCBlA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_jqmGdhzdkozCBlA + +.L_16_blocks_overflow_jqmGdhzdkozCBlA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_jqmGdhzdkozCBlA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_Daizbjyimqaduru + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + 
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_Daizbjyimqaduru +.L_small_initial_partial_block_Daizbjyimqaduru: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_Daizbjyimqaduru: + + orq %r8,%r8 + je .L_after_reduction_Daizbjyimqaduru + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_Daizbjyimqaduru: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_9_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_CDuwyvGbafyeBuk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_CDuwyvGbafyeBuk + +.L_16_blocks_overflow_CDuwyvGbafyeBuk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_CDuwyvGbafyeBuk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + 
+ vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kpAafwlxkcfbCCh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + 
vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_kpAafwlxkcfbCCh +.L_small_initial_partial_block_kpAafwlxkcfbCCh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kpAafwlxkcfbCCh: + + orq %r8,%r8 + je .L_after_reduction_kpAafwlxkcfbCCh + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_kpAafwlxkcfbCCh: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_10_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_tDtiElGDCfanulC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_tDtiElGDCfanulC + +.L_16_blocks_overflow_tDtiElGDCfanulC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_tDtiElGDCfanulC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zphfokajCjwqcAg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + 
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zphfokajCjwqcAg +.L_small_initial_partial_block_zphfokajCjwqcAg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zphfokajCjwqcAg: + + orq %r8,%r8 + je .L_after_reduction_zphfokajCjwqcAg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zphfokajCjwqcAg: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_11_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_wqmiytsuGwmqxEk + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_wqmiytsuGwmqxEk + +.L_16_blocks_overflow_wqmiytsuGwmqxEk: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_wqmiytsuGwmqxEk: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DlBrprmzzykyokm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + 
vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DlBrprmzzykyokm +.L_small_initial_partial_block_DlBrprmzzykyokm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DlBrprmzzykyokm: + + orq %r8,%r8 + je .L_after_reduction_DlBrprmzzykyokm + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DlBrprmzzykyokm: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_12_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_annCtoGejoBwwxn + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_annCtoGejoBwwxn + +.L_16_blocks_overflow_annCtoGejoBwwxn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 
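+/*
+ * Annotation (not in the generated source): 12-block tail path. The
+ * AES-CTR rounds on the final counter blocks below are interleaved with
+ * vpclmulqdq GHASH partial products, and the masked (%k1) load/store
+ * covers the trailing partial 16-byte block.
+ */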
+.L_16_blocks_ok_annCtoGejoBwwxn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_viBlGurDavwztrf + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq 
$0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_viBlGurDavwztrf +.L_small_initial_partial_block_viBlGurDavwztrf: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_viBlGurDavwztrf: + + orq %r8,%r8 + je .L_after_reduction_viBlGurDavwztrf + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_viBlGurDavwztrf: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_13_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_zmshcCvwkdwGlaB + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_zmshcCvwkdwGlaB + +.L_16_blocks_overflow_zmshcCvwkdwGlaB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd 
ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_zmshcCvwkdwGlaB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kqdfAoFcBDkeGbm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_kqdfAoFcBDkeGbm +.L_small_initial_partial_block_kqdfAoFcBDkeGbm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kqdfAoFcBDkeGbm: + + orq %r8,%r8 + je .L_after_reduction_kqdfAoFcBDkeGbm + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_kqdfAoFcBDkeGbm: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_14_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_boziaaCCygjjfxw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_boziaaCCygjjfxw + +.L_16_blocks_overflow_boziaaCCygjjfxw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_boziaaCCygjjfxw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc 
%ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_znbGdxrosrCeabB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_znbGdxrosrCeabB +.L_small_initial_partial_block_znbGdxrosrCeabB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + 
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_znbGdxrosrCeabB: + + orq %r8,%r8 + je .L_after_reduction_znbGdxrosrCeabB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_znbGdxrosrCeabB: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_15_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_rliugxzwdyFGiBD + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_rliugxzwdyFGiBD + +.L_16_blocks_overflow_rliugxzwdyFGiBD: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_rliugxzwdyFGiBD: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_olnbAdcngmvvEdn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + 
vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_olnbAdcngmvvEdn +.L_small_initial_partial_block_olnbAdcngmvvEdn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_olnbAdcngmvvEdn: + + orq %r8,%r8 + je .L_after_reduction_olnbAdcngmvvEdn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_olnbAdcngmvvEdn: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_16_ytkmwztBxmufdeg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_gmEGrjFikmwGcAm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_gmEGrjFikmwGcAm + +.L_16_blocks_overflow_gmEGrjFikmwGcAm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_gmEGrjFikmwGcAm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + 
vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 
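+/*
+ * Annotation (not in the generated source): GHASH the 16 blocks just
+ * processed against the hash-key powers kept at 112..336(%rsi), fold the
+ * high/low 128-bit halves, and reduce with the POLY2 constant into %xmm14.
+ */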
+.L_small_initial_partial_block_dplntcAkoiBEkDo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dplntcAkoiBEkDo: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dplntcAkoiBEkDo: + jmp .L_last_blocks_done_ytkmwztBxmufdeg +.L_last_num_blocks_is_0_ytkmwztBxmufdeg: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 
$1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_ytkmwztBxmufdeg: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_brADimEeCnCcDmv + +.L_message_below_equal_16_blocks_brADimEeCnCcDmv: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_nmhEfDfgEBvcjnt + jl .L_small_initial_num_blocks_is_7_1_nmhEfDfgEBvcjnt + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_nmhEfDfgEBvcjnt + jl .L_small_initial_num_blocks_is_11_9_nmhEfDfgEBvcjnt + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_nmhEfDfgEBvcjnt + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_nmhEfDfgEBvcjnt + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_nmhEfDfgEBvcjnt + jmp .L_small_initial_num_blocks_is_13_nmhEfDfgEBvcjnt + +.L_small_initial_num_blocks_is_11_9_nmhEfDfgEBvcjnt: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_nmhEfDfgEBvcjnt + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_nmhEfDfgEBvcjnt + jmp .L_small_initial_num_blocks_is_9_nmhEfDfgEBvcjnt + +.L_small_initial_num_blocks_is_7_1_nmhEfDfgEBvcjnt: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_nmhEfDfgEBvcjnt + jl .L_small_initial_num_blocks_is_3_1_nmhEfDfgEBvcjnt + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_nmhEfDfgEBvcjnt + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_nmhEfDfgEBvcjnt + jmp .L_small_initial_num_blocks_is_5_nmhEfDfgEBvcjnt + +.L_small_initial_num_blocks_is_3_1_nmhEfDfgEBvcjnt: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_nmhEfDfgEBvcjnt + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_nmhEfDfgEBvcjnt + + + + + +.L_small_initial_num_blocks_is_1_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm6,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AyfivemhvfDjwew + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 
+ vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AyfivemhvfDjwew +.L_small_initial_partial_block_AyfivemhvfDjwew: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_AyfivemhvfDjwew +.L_small_initial_compute_done_AyfivemhvfDjwew: +.L_after_reduction_AyfivemhvfDjwew: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_2_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm6,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mFdfDiDtuhyrCwk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mFdfDiDtuhyrCwk +.L_small_initial_partial_block_mFdfDiDtuhyrCwk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq 
$0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mFdfDiDtuhyrCwk: + + orq %r8,%r8 + je .L_after_reduction_mFdfDiDtuhyrCwk + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_mFdfDiDtuhyrCwk: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_3_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AvGtGumzxshjiFB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AvGtGumzxshjiFB +.L_small_initial_partial_block_AvGtGumzxshjiFB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq 
$0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AvGtGumzxshjiFB: + + orq %r8,%r8 + je .L_after_reduction_AvGtGumzxshjiFB + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_AvGtGumzxshjiFB: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_4_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DbentnbaeCzAufz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DbentnbaeCzAufz +.L_small_initial_partial_block_DbentnbaeCzAufz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq 
$0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DbentnbaeCzAufz: + + orq %r8,%r8 + je .L_after_reduction_DbentnbaeCzAufz + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_DbentnbaeCzAufz: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_5_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %xmm29,%xmm7,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dnEAtijzGEDlswn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq 
%zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dnEAtijzGEDlswn +.L_small_initial_partial_block_dnEAtijzGEDlswn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dnEAtijzGEDlswn: + + orq %r8,%r8 + je .L_after_reduction_dnEAtijzGEDlswn + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_dnEAtijzGEDlswn: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_6_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + 
vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %ymm29,%ymm7,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_umqipkezFkCyFdu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_umqipkezFkCyFdu +.L_small_initial_partial_block_umqipkezFkCyFdu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_umqipkezFkCyFdu: + + orq %r8,%r8 + je .L_after_reduction_umqipkezFkCyFdu + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_umqipkezFkCyFdu: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_7_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 
16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lEGtnzekhyuwBFz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lEGtnzekhyuwBFz +.L_small_initial_partial_block_lEGtnzekhyuwBFz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq 
%ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lEGtnzekhyuwBFz: + + orq %r8,%r8 + je .L_after_reduction_lEGtnzekhyuwBFz + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_lEGtnzekhyuwBFz: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_8_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EasGBEsimbhszDy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EasGBEsimbhszDy +.L_small_initial_partial_block_EasGBEsimbhszDy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EasGBEsimbhszDy: + + orq %r8,%r8 + je .L_after_reduction_EasGBEsimbhszDy + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_EasGBEsimbhszDy: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_9_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + 
vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %xmm29,%xmm10,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DlhndmhlkxypvAb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DlhndmhlkxypvAb +.L_small_initial_partial_block_DlhndmhlkxypvAb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DlhndmhlkxypvAb: + + orq 
%r8,%r8 + je .L_after_reduction_DlhndmhlkxypvAb + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_DlhndmhlkxypvAb: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_10_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %ymm29,%ymm10,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_cwsdomEqheptkED + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 
+ vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_cwsdomEqheptkED +.L_small_initial_partial_block_cwsdomEqheptkED: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_cwsdomEqheptkED: + + orq %r8,%r8 + je .L_after_reduction_cwsdomEqheptkED + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_cwsdomEqheptkED: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_11_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + 
vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qxeFvgzdwFFywqx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qxeFvgzdwFFywqx +.L_small_initial_partial_block_qxeFvgzdwFFywqx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq 
$0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qxeFvgzdwFFywqx: + + orq %r8,%r8 + je .L_after_reduction_qxeFvgzdwFFywqx + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_qxeFvgzdwFFywqx: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_12_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oqzAvlGuDiExAmm 
+ + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oqzAvlGuDiExAmm +.L_small_initial_partial_block_oqzAvlGuDiExAmm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oqzAvlGuDiExAmm: + + orq %r8,%r8 + je .L_after_reduction_oqzAvlGuDiExAmm + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_oqzAvlGuDiExAmm: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_13_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd 
ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %xmm29,%xmm11,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yqGygqlhwnnpjbq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + 
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yqGygqlhwnnpjbq +.L_small_initial_partial_block_yqGygqlhwnnpjbq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yqGygqlhwnnpjbq: + + orq %r8,%r8 + je .L_after_reduction_yqGygqlhwnnpjbq + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_yqGygqlhwnnpjbq: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_14_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq 
%ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %ymm29,%ymm11,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wByexunpeunlcgC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq 
%xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wByexunpeunlcgC +.L_small_initial_partial_block_wByexunpeunlcgC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wByexunpeunlcgC: + + orq %r8,%r8 + je .L_after_reduction_wByexunpeunlcgC + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_wByexunpeunlcgC: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_15_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 
48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sAhCDvCwGcBErvs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_sAhCDvCwGcBErvs +.L_small_initial_partial_block_sAhCDvCwGcBErvs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_sAhCDvCwGcBErvs: + + orq %r8,%r8 + je .L_after_reduction_sAhCDvCwGcBErvs + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_sAhCDvCwGcBErvs: + jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt +.L_small_initial_num_blocks_is_16_nmhEfDfgEBvcjnt: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + 
vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_BGcpniuuBjzyonj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BGcpniuuBjzyonj: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_BGcpniuuBjzyonj: +.L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt: +.L_ghash_done_brADimEeCnCcDmv: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) 
+.L_enc_dec_done_brADimEeCnCcDmv: + jmp .Lexit_gcm_decrypt +.align 32 +.Laes_gcm_decrypt_192_avx512: + orq %r8,%r8 + je .L_enc_dec_done_yiifChpfBbxhAhe + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_EexishzBqqwurDt + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + + vmovdqa64 %xmm0,%xmm6 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_EexishzBqqwurDt + subq %r13,%r12 +.L_no_extra_mask_EexishzBqqwurDt: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpand %xmm0,%xmm6,%xmm6 + vpshufb SHUF_MASK(%rip),%xmm6,%xmm6 + vpshufb %xmm5,%xmm6,%xmm6 + vpxorq %xmm6,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_EexishzBqqwurDt + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_EexishzBqqwurDt + +.L_partial_incomplete_EexishzBqqwurDt: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_EexishzBqqwurDt: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_EexishzBqqwurDt: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_yiifChpfBbxhAhe + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_yiifChpfBbxhAhe + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_tfgagBztCGiipfj + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_tfgagBztCGiipfj +.L_next_16_overflow_tfgagBztCGiipfj: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_tfgagBztCGiipfj: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + 
vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_wuytBaevFghAmde + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_wuytBaevFghAmde: + cmpq $512,%r8 + jb .L_message_below_32_blocks_yiifChpfBbxhAhe + + + + cmpb $240,%r15b + jae .L_next_16_overflow_nzEGCllDaFxsseu + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_nzEGCllDaFxsseu +.L_next_16_overflow_nzEGCllDaFxsseu: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_nzEGCllDaFxsseu: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq 
%zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_CDApkmzFaysFbmb + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + 
vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq 
$8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_CDApkmzFaysFbmb: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_yiifChpfBbxhAhe +.L_encrypt_big_nblocks_yiifChpfBbxhAhe: + cmpb $240,%r15b + jae .L_16_blocks_overflow_EkchfDegrAlelEj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_EkchfDegrAlelEj +.L_16_blocks_overflow_EkchfDegrAlelEj: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_EkchfDegrAlelEj: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc 
%zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_ymdbteyxuoqtqnl + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_ymdbteyxuoqtqnl +.L_16_blocks_overflow_ymdbteyxuoqtqnl: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_ymdbteyxuoqtqnl: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_tyfBFhaGurfjEFr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_tyfBFhaGurfjEFr +.L_16_blocks_overflow_tyfBFhaGurfjEFr: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_tyfBFhaGurfjEFr: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast 
%zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_yiifChpfBbxhAhe + +.L_no_more_big_nblocks_yiifChpfBbxhAhe: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_yiifChpfBbxhAhe + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_yiifChpfBbxhAhe +.L_encrypt_0_blocks_ghash_32_yiifChpfBbxhAhe: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_EnDAnndDABDpwrg + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_EnDAnndDABDpwrg + jb .L_last_num_blocks_is_7_1_EnDAnndDABDpwrg + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_EnDAnndDABDpwrg + jb .L_last_num_blocks_is_11_9_EnDAnndDABDpwrg + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_EnDAnndDABDpwrg + ja .L_last_num_blocks_is_16_EnDAnndDABDpwrg + cmpl $14,%r10d + je .L_last_num_blocks_is_14_EnDAnndDABDpwrg + jmp .L_last_num_blocks_is_13_EnDAnndDABDpwrg + +.L_last_num_blocks_is_11_9_EnDAnndDABDpwrg: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_EnDAnndDABDpwrg + ja .L_last_num_blocks_is_11_EnDAnndDABDpwrg + jmp .L_last_num_blocks_is_9_EnDAnndDABDpwrg + +.L_last_num_blocks_is_7_1_EnDAnndDABDpwrg: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_EnDAnndDABDpwrg + jb .L_last_num_blocks_is_3_1_EnDAnndDABDpwrg + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_EnDAnndDABDpwrg + je .L_last_num_blocks_is_6_EnDAnndDABDpwrg + jmp .L_last_num_blocks_is_5_EnDAnndDABDpwrg + +.L_last_num_blocks_is_3_1_EnDAnndDABDpwrg: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_EnDAnndDABDpwrg + je .L_last_num_blocks_is_2_EnDAnndDABDpwrg +.L_last_num_blocks_is_1_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq 
%r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_GgCAgFtCzDDmtga + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_GgCAgFtCzDDmtga + +.L_16_blocks_overflow_GgCAgFtCzDDmtga: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_GgCAgFtCzDDmtga: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_muErgpqjgcDnuvy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_muErgpqjgcDnuvy +.L_small_initial_partial_block_muErgpqjgcDnuvy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_muErgpqjgcDnuvy +.L_small_initial_compute_done_muErgpqjgcDnuvy: +.L_after_reduction_muErgpqjgcDnuvy: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_2_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_sGdlxeauwrjkrtA + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_sGdlxeauwrjkrtA + +.L_16_blocks_overflow_sGdlxeauwrjkrtA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_sGdlxeauwrjkrtA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 
+ vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mixrqrhnvplnBsa + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mixrqrhnvplnBsa +.L_small_initial_partial_block_mixrqrhnvplnBsa: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mixrqrhnvplnBsa: + + orq %r8,%r8 + je .L_after_reduction_mixrqrhnvplnBsa + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_mixrqrhnvplnBsa: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_3_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_whibjFbDFpmwsdg + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_whibjFbDFpmwsdg + +.L_16_blocks_overflow_whibjFbDFpmwsdg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_whibjFbDFpmwsdg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + 
vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lAnoBCFfkdkhBpw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lAnoBCFfkdkhBpw +.L_small_initial_partial_block_lAnoBCFfkdkhBpw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + 
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lAnoBCFfkdkhBpw: + + orq %r8,%r8 + je .L_after_reduction_lAnoBCFfkdkhBpw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lAnoBCFfkdkhBpw: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_4_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_CACaGmtylGFBBes + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_CACaGmtylGFBBes + +.L_16_blocks_overflow_CACaGmtylGFBBes: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_CACaGmtylGFBBes: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bDpjzbsFvemyBzb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bDpjzbsFvemyBzb +.L_small_initial_partial_block_bDpjzbsFvemyBzb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bDpjzbsFvemyBzb: + + orq %r8,%r8 + je .L_after_reduction_bDpjzbsFvemyBzb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bDpjzbsFvemyBzb: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_5_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_imFzBFrgiBtDFwx + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_imFzBFrgiBtDFwx + +.L_16_blocks_overflow_imFzBFrgiBtDFwx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_imFzBFrgiBtDFwx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vnnCjDqmzbcdpik + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vnnCjDqmzbcdpik +.L_small_initial_partial_block_vnnCjDqmzbcdpik: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vnnCjDqmzbcdpik: + + orq %r8,%r8 + je .L_after_reduction_vnnCjDqmzbcdpik + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vnnCjDqmzbcdpik: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_6_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_pAdtiatocvAeptw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_pAdtiatocvAeptw + +.L_16_blocks_overflow_pAdtiatocvAeptw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_pAdtiatocvAeptw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + 
vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gvfhgipCiigqdGj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gvfhgipCiigqdGj +.L_small_initial_partial_block_gvfhgipCiigqdGj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq 
$0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gvfhgipCiigqdGj: + + orq %r8,%r8 + je .L_after_reduction_gvfhgipCiigqdGj + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_gvfhgipCiigqdGj: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_7_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_xxGFqeesBsuBajd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_xxGFqeesBsuBajd + +.L_16_blocks_overflow_xxGFqeesBsuBajd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_xxGFqeesBsuBajd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq 
$0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_nFyvcbadpdjqnGl + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_nFyvcbadpdjqnGl +.L_small_initial_partial_block_nFyvcbadpdjqnGl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_nFyvcbadpdjqnGl: + + orq %r8,%r8 + je .L_after_reduction_nFyvcbadpdjqnGl + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_nFyvcbadpdjqnGl: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_8_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_qtzDbmlGiqglyFC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_qtzDbmlGiqglyFC + +.L_16_blocks_overflow_qtzDbmlGiqglyFC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_qtzDbmlGiqglyFC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 
%zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jhfdGzoqFGvFnBz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jhfdGzoqFGvFnBz +.L_small_initial_partial_block_jhfdGzoqFGvFnBz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jhfdGzoqFGvFnBz: + + orq %r8,%r8 + je .L_after_reduction_jhfdGzoqFGvFnBz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jhfdGzoqFGvFnBz: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_9_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_wmBlfbGwbkoxgju + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_wmBlfbGwbkoxgju + +.L_16_blocks_overflow_wmBlfbGwbkoxgju: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_wmBlfbGwbkoxgju: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 
+ vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_baszqDAmduvhiiE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_baszqDAmduvhiiE +.L_small_initial_partial_block_baszqDAmduvhiiE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_baszqDAmduvhiiE: + + orq %r8,%r8 + je .L_after_reduction_baszqDAmduvhiiE + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_baszqDAmduvhiiE: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_10_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_stwxpAgbfshrvAC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + 
jmp .L_16_blocks_ok_stwxpAgbfshrvAC + +.L_16_blocks_overflow_stwxpAgbfshrvAC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_stwxpAgbfshrvAC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + 
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_exAeuCGujFxiqAh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_exAeuCGujFxiqAh +.L_small_initial_partial_block_exAeuCGujFxiqAh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_exAeuCGujFxiqAh: + + orq %r8,%r8 + je .L_after_reduction_exAeuCGujFxiqAh + 
vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_exAeuCGujFxiqAh: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_11_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_AxBbgslpvfAEaln + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_AxBbgslpvfAEaln + +.L_16_blocks_overflow_AxBbgslpvfAEaln: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_AxBbgslpvfAEaln: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 
192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DbcpAfrkzFcgwwp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DbcpAfrkzFcgwwp +.L_small_initial_partial_block_DbcpAfrkzFcgwwp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DbcpAfrkzFcgwwp: + + orq %r8,%r8 + je .L_after_reduction_DbcpAfrkzFcgwwp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DbcpAfrkzFcgwwp: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_12_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_smrhssarGEoyasa + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_smrhssarGEoyasa + +.L_16_blocks_overflow_smrhssarGEoyasa: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_smrhssarGEoyasa: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + 
vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rouvbBEfwtDrsEg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rouvbBEfwtDrsEg +.L_small_initial_partial_block_rouvbBEfwtDrsEg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq 
$0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rouvbBEfwtDrsEg: + + orq %r8,%r8 + je .L_after_reduction_rouvbBEfwtDrsEg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rouvbBEfwtDrsEg: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_13_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_DrfxGvBzxdbnqak + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_DrfxGvBzxdbnqak + +.L_16_blocks_overflow_DrfxGvBzxdbnqak: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_DrfxGvBzxdbnqak: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 
64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wcayAkkuiehcgnC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + 
vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wcayAkkuiehcgnC +.L_small_initial_partial_block_wcayAkkuiehcgnC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wcayAkkuiehcgnC: + + orq %r8,%r8 + je .L_after_reduction_wcayAkkuiehcgnC + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wcayAkkuiehcgnC: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_14_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_kAcyvjjAkbnGGoE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_kAcyvjjAkbnGGoE + +.L_16_blocks_overflow_kAcyvjjAkbnGGoE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_kAcyvjjAkbnGGoE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 
1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lECstFkGozakhDE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lECstFkGozakhDE +.L_small_initial_partial_block_lECstFkGozakhDE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lECstFkGozakhDE: + + orq %r8,%r8 + je .L_after_reduction_lECstFkGozakhDE + 
vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lECstFkGozakhDE: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_15_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_uvsntmjBtmwoAgA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_uvsntmjBtmwoAgA + +.L_16_blocks_overflow_uvsntmjBtmwoAgA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_uvsntmjBtmwoAgA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gFfyGkDCahpvfAe + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gFfyGkDCahpvfAe +.L_small_initial_partial_block_gFfyGkDCahpvfAe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + 
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gFfyGkDCahpvfAe: + + orq %r8,%r8 + je .L_after_reduction_gFfyGkDCahpvfAe + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_gFfyGkDCahpvfAe: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_16_EnDAnndDABDpwrg: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_jwffjzkjrdbGmqd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_jwffjzkjrdbGmqd + +.L_16_blocks_overflow_jwffjzkjrdbGmqd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_jwffjzkjrdbGmqd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_ccvdpppmDomgiCD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 
$2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ccvdpppmDomgiCD: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ccvdpppmDomgiCD: + jmp .L_last_blocks_done_EnDAnndDABDpwrg +.L_last_num_blocks_is_0_EnDAnndDABDpwrg: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_EnDAnndDABDpwrg: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_yiifChpfBbxhAhe +.L_encrypt_32_blocks_yiifChpfBbxhAhe: + cmpb $240,%r15b + jae .L_16_blocks_overflow_igclhxhftlBGfml + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_igclhxhftlBGfml +.L_16_blocks_overflow_igclhxhftlBGfml: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd 
%zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_igclhxhftlBGfml: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + 
vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_hgchDvhDwhDhkhj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_hgchDvhDwhDhkhj +.L_16_blocks_overflow_hgchDvhDwhDhkhj: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_hgchDvhDwhDhkhj: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + 
vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je 
.L_last_num_blocks_is_0_mzebEnFmrFgqunA + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_mzebEnFmrFgqunA + jb .L_last_num_blocks_is_7_1_mzebEnFmrFgqunA + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_mzebEnFmrFgqunA + jb .L_last_num_blocks_is_11_9_mzebEnFmrFgqunA + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_mzebEnFmrFgqunA + ja .L_last_num_blocks_is_16_mzebEnFmrFgqunA + cmpl $14,%r10d + je .L_last_num_blocks_is_14_mzebEnFmrFgqunA + jmp .L_last_num_blocks_is_13_mzebEnFmrFgqunA + +.L_last_num_blocks_is_11_9_mzebEnFmrFgqunA: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_mzebEnFmrFgqunA + ja .L_last_num_blocks_is_11_mzebEnFmrFgqunA + jmp .L_last_num_blocks_is_9_mzebEnFmrFgqunA + +.L_last_num_blocks_is_7_1_mzebEnFmrFgqunA: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_mzebEnFmrFgqunA + jb .L_last_num_blocks_is_3_1_mzebEnFmrFgqunA + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_mzebEnFmrFgqunA + je .L_last_num_blocks_is_6_mzebEnFmrFgqunA + jmp .L_last_num_blocks_is_5_mzebEnFmrFgqunA + +.L_last_num_blocks_is_3_1_mzebEnFmrFgqunA: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_mzebEnFmrFgqunA + je .L_last_num_blocks_is_2_mzebEnFmrFgqunA +.L_last_num_blocks_is_1_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_nGCoqEFBGnmxbxd + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_nGCoqEFBGnmxbxd + +.L_16_blocks_overflow_nGCoqEFBGnmxbxd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_nGCoqEFBGnmxbxd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 
+ vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pteDFgEDjspDekt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pteDFgEDjspDekt +.L_small_initial_partial_block_pteDFgEDjspDekt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_pteDFgEDjspDekt +.L_small_initial_compute_done_pteDFgEDjspDekt: +.L_after_reduction_pteDFgEDjspDekt: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_2_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_BnoeeeAuxpuGrCd + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_BnoeeeAuxpuGrCd + +.L_16_blocks_overflow_BnoeeeAuxpuGrCd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_BnoeeeAuxpuGrCd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + 
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pGCaGvdapDriFwq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pGCaGvdapDriFwq +.L_small_initial_partial_block_pGCaGvdapDriFwq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pGCaGvdapDriFwq: + + orq %r8,%r8 + je .L_after_reduction_pGCaGvdapDriFwq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pGCaGvdapDriFwq: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_3_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_rpvBmmdleounkfg + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_rpvBmmdleounkfg + +.L_16_blocks_overflow_rpvBmmdleounkfg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_rpvBmmdleounkfg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_EDfFbxCoAeBbBmG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_EDfFbxCoAeBbBmG +.L_small_initial_partial_block_EDfFbxCoAeBbBmG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_EDfFbxCoAeBbBmG: + + orq %r8,%r8 + je .L_after_reduction_EDfFbxCoAeBbBmG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_EDfFbxCoAeBbBmG: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_4_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_eejufxFfpkhainn + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_eejufxFfpkhainn + +.L_16_blocks_overflow_eejufxFfpkhainn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_eejufxFfpkhainn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + 
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rtqFkraGudeyaFm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rtqFkraGudeyaFm +.L_small_initial_partial_block_rtqFkraGudeyaFm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rtqFkraGudeyaFm: + + orq %r8,%r8 + je .L_after_reduction_rtqFkraGudeyaFm + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rtqFkraGudeyaFm: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_5_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_bgofyFpgEnsntBw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_bgofyFpgEnsntBw + +.L_16_blocks_overflow_bgofyFpgEnsntBw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_bgofyFpgEnsntBw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uCfkbGGrphGcGba + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uCfkbGGrphGcGba +.L_small_initial_partial_block_uCfkbGGrphGcGba: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uCfkbGGrphGcGba: + + orq %r8,%r8 + je .L_after_reduction_uCfkbGGrphGcGba + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uCfkbGGrphGcGba: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_6_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_GvptlszrGgmFuve + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_GvptlszrGgmFuve + +.L_16_blocks_overflow_GvptlszrGgmFuve: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_GvptlszrGgmFuve: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oFAlvAhpbuuoctp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq 
%xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oFAlvAhpbuuoctp +.L_small_initial_partial_block_oFAlvAhpbuuoctp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oFAlvAhpbuuoctp: + + orq %r8,%r8 + je .L_after_reduction_oFAlvAhpbuuoctp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oFAlvAhpbuuoctp: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_7_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_DxbjcygrgxudEjb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_DxbjcygrgxudEjb + +.L_16_blocks_overflow_DxbjcygrgxudEjb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_DxbjcygrgxudEjb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + 
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xFeGbEcEyBujjsd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xFeGbEcEyBujjsd +.L_small_initial_partial_block_xFeGbEcEyBujjsd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + 
vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xFeGbEcEyBujjsd: + + orq %r8,%r8 + je .L_after_reduction_xFeGbEcEyBujjsd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xFeGbEcEyBujjsd: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_8_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_njjFmdkzFAzEDDa + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_njjFmdkzFAzEDDa + +.L_16_blocks_overflow_njjFmdkzFAzEDDa: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_njjFmdkzFAzEDDa: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ozrwtEFqpzbbFif + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ozrwtEFqpzbbFif +.L_small_initial_partial_block_ozrwtEFqpzbbFif: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + 
vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ozrwtEFqpzbbFif: + + orq %r8,%r8 + je .L_after_reduction_ozrwtEFqpzbbFif + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ozrwtEFqpzbbFif: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_9_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_tzqaclAtnqeEABy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_tzqaclAtnqeEABy + +.L_16_blocks_overflow_tzqaclAtnqeEABy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_tzqaclAtnqeEABy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 
144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_akxrmDCvAwmtoBq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_akxrmDCvAwmtoBq +.L_small_initial_partial_block_akxrmDCvAwmtoBq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq 
%xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_akxrmDCvAwmtoBq: + + orq %r8,%r8 + je .L_after_reduction_akxrmDCvAwmtoBq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_akxrmDCvAwmtoBq: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_10_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_mdrttBDhusakuks + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_mdrttBDhusakuks + +.L_16_blocks_overflow_mdrttBDhusakuks: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_mdrttBDhusakuks: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 
+ vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_iAgGclofsEyxAFd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_iAgGclofsEyxAFd +.L_small_initial_partial_block_iAgGclofsEyxAFd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_iAgGclofsEyxAFd: + + orq %r8,%r8 + je .L_after_reduction_iAgGclofsEyxAFd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_iAgGclofsEyxAFd: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_11_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_dngFDcgnxjanBrr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_dngFDcgnxjanBrr + +.L_16_blocks_overflow_dngFDcgnxjanBrr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_dngFDcgnxjanBrr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + 
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_okvBnGbFccGxioi + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_okvBnGbFccGxioi +.L_small_initial_partial_block_okvBnGbFccGxioi: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq 
$0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_okvBnGbFccGxioi: + + orq %r8,%r8 + je .L_after_reduction_okvBnGbFccGxioi + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_okvBnGbFccGxioi: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_12_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_aubdtmlCEjgrkqC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_aubdtmlCEjgrkqC + +.L_16_blocks_overflow_aubdtmlCEjgrkqC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_aubdtmlCEjgrkqC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq 
$0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fAvjEssplkpFDzu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_fAvjEssplkpFDzu +.L_small_initial_partial_block_fAvjEssplkpFDzu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fAvjEssplkpFDzu: + + orq %r8,%r8 + je .L_after_reduction_fAvjEssplkpFDzu + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fAvjEssplkpFDzu: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_13_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_tgGfmxsfvvfjlut + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_tgGfmxsfvvfjlut + +.L_16_blocks_overflow_tgGfmxsfvvfjlut: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_tgGfmxsfvvfjlut: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dGgFeCerpjagCtb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + 
vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dGgFeCerpjagCtb +.L_small_initial_partial_block_dGgFeCerpjagCtb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dGgFeCerpjagCtb: + + orq %r8,%r8 + je .L_after_reduction_dGgFeCerpjagCtb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dGgFeCerpjagCtb: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_14_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_GjeuEqvcyhCdAlB + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_GjeuEqvcyhCdAlB + +.L_16_blocks_overflow_GjeuEqvcyhCdAlB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_GjeuEqvcyhCdAlB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast 
%zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CbnaspueplphnCn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CbnaspueplphnCn +.L_small_initial_partial_block_CbnaspueplphnCn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq 
$0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CbnaspueplphnCn: + + orq %r8,%r8 + je .L_after_reduction_CbnaspueplphnCn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CbnaspueplphnCn: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_15_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_vduCxcjofxGqAou + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_vduCxcjofxGqAou + +.L_16_blocks_overflow_vduCxcjofxGqAou: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_vduCxcjofxGqAou: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 
128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xdoEhGjsfscahrp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xdoEhGjsfscahrp +.L_small_initial_partial_block_xdoEhGjsfscahrp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xdoEhGjsfscahrp: + + orq %r8,%r8 + je .L_after_reduction_xdoEhGjsfscahrp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xdoEhGjsfscahrp: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_16_mzebEnFmrFgqunA: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_skEyjqiskGfxdvC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_skEyjqiskGfxdvC + +.L_16_blocks_overflow_skEyjqiskGfxdvC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_skEyjqiskGfxdvC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 
32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_dxixdfuDqivveAt: + + + + + + + + + 
movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dxixdfuDqivveAt: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dxixdfuDqivveAt: + jmp .L_last_blocks_done_mzebEnFmrFgqunA +.L_last_num_blocks_is_0_mzebEnFmrFgqunA: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 
POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_mzebEnFmrFgqunA: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_yiifChpfBbxhAhe +.L_encrypt_16_blocks_yiifChpfBbxhAhe: + cmpb $240,%r15b + jae .L_16_blocks_overflow_lGoEsFGcBhBnEgo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_lGoEsFGcBhBnEgo +.L_16_blocks_overflow_lGoEsFGcBhBnEgo: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_lGoEsFGcBhBnEgo: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + 
vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_GGlifssooGvFomC + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_GGlifssooGvFomC + jb .L_last_num_blocks_is_7_1_GGlifssooGvFomC + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_GGlifssooGvFomC + jb .L_last_num_blocks_is_11_9_GGlifssooGvFomC + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_GGlifssooGvFomC + ja .L_last_num_blocks_is_16_GGlifssooGvFomC + cmpl $14,%r10d + je .L_last_num_blocks_is_14_GGlifssooGvFomC + jmp .L_last_num_blocks_is_13_GGlifssooGvFomC + +.L_last_num_blocks_is_11_9_GGlifssooGvFomC: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_GGlifssooGvFomC + ja .L_last_num_blocks_is_11_GGlifssooGvFomC + jmp .L_last_num_blocks_is_9_GGlifssooGvFomC + +.L_last_num_blocks_is_7_1_GGlifssooGvFomC: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_GGlifssooGvFomC + jb .L_last_num_blocks_is_3_1_GGlifssooGvFomC + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_GGlifssooGvFomC + je 
.L_last_num_blocks_is_6_GGlifssooGvFomC + jmp .L_last_num_blocks_is_5_GGlifssooGvFomC + +.L_last_num_blocks_is_3_1_GGlifssooGvFomC: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_GGlifssooGvFomC + je .L_last_num_blocks_is_2_GGlifssooGvFomC +.L_last_num_blocks_is_1_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_DFdkfCEpyEuzGts + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_DFdkfCEpyEuzGts + +.L_16_blocks_overflow_DFdkfCEpyEuzGts: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_DFdkfCEpyEuzGts: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + 
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_znzDmxCrzeqhmtt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_znzDmxCrzeqhmtt +.L_small_initial_partial_block_znzDmxCrzeqhmtt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_znzDmxCrzeqhmtt +.L_small_initial_compute_done_znzDmxCrzeqhmtt: +.L_after_reduction_znzDmxCrzeqhmtt: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_2_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_fxAkfvCdnqqGArm + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_fxAkfvCdnqqGArm + +.L_16_blocks_overflow_fxAkfvCdnqqGArm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_fxAkfvCdnqqGArm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc 
%ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kgAaABygmxmrDhD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_kgAaABygmxmrDhD +.L_small_initial_partial_block_kgAaABygmxmrDhD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kgAaABygmxmrDhD: + + orq %r8,%r8 + je 
.L_after_reduction_kgAaABygmxmrDhD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_kgAaABygmxmrDhD: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_3_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_DnqopufcDlfooBF + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_DnqopufcDlfooBF + +.L_16_blocks_overflow_DnqopufcDlfooBF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_DnqopufcDlfooBF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + 
vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qgbxmvAdpcwjFGD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qgbxmvAdpcwjFGD +.L_small_initial_partial_block_qgbxmvAdpcwjFGD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qgbxmvAdpcwjFGD: + + orq %r8,%r8 + je .L_after_reduction_qgbxmvAdpcwjFGD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_qgbxmvAdpcwjFGD: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_4_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_zzorvqhpvdBckcq + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_zzorvqhpvdBckcq + +.L_16_blocks_overflow_zzorvqhpvdBckcq: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_zzorvqhpvdBckcq: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 
704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_giCxqwgmxrChxdc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_giCxqwgmxrChxdc +.L_small_initial_partial_block_giCxqwgmxrChxdc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_giCxqwgmxrChxdc: + + orq %r8,%r8 + je .L_after_reduction_giCxqwgmxrChxdc + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_giCxqwgmxrChxdc: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_5_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_qzjnvgqjjxsfmEr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_qzjnvgqjjxsfmEr + +.L_16_blocks_overflow_qzjnvgqjjxsfmEr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_qzjnvgqjjxsfmEr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq 
$0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xoEftvygjvpovck + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xoEftvygjvpovck +.L_small_initial_partial_block_xoEftvygjvpovck: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + 
vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xoEftvygjvpovck: + + orq %r8,%r8 + je .L_after_reduction_xoEftvygjvpovck + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xoEftvygjvpovck: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_6_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_mvFwizCezuedAbr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_mvFwizCezuedAbr + +.L_16_blocks_overflow_mvFwizCezuedAbr: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_mvFwizCezuedAbr: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FDuhyDmhetmzsvq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FDuhyDmhetmzsvq +.L_small_initial_partial_block_FDuhyDmhetmzsvq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + 
vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FDuhyDmhetmzsvq: + + orq %r8,%r8 + je .L_after_reduction_FDuhyDmhetmzsvq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FDuhyDmhetmzsvq: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_7_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_owtBaGpzgzgcxrC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_owtBaGpzgzgcxrC + +.L_16_blocks_overflow_owtBaGpzgzgcxrC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_owtBaGpzgzgcxrC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq 
%xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DncaxytjCyxiknt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DncaxytjCyxiknt +.L_small_initial_partial_block_DncaxytjCyxiknt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DncaxytjCyxiknt: + + orq %r8,%r8 + je .L_after_reduction_DncaxytjCyxiknt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DncaxytjCyxiknt: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_8_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_dAhdphrDhhiFfvd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_dAhdphrDhhiFfvd + +.L_16_blocks_overflow_dAhdphrDhhiFfvd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_dAhdphrDhhiFfvd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq 
%xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CnEvizjBlzFFnif + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CnEvizjBlzFFnif +.L_small_initial_partial_block_CnEvizjBlzFFnif: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_CnEvizjBlzFFnif: + + orq %r8,%r8 + je .L_after_reduction_CnEvizjBlzFFnif + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CnEvizjBlzFFnif: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_9_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_eaicByEvunpebxo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_eaicByEvunpebxo + +.L_16_blocks_overflow_eaicByEvunpebxo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_eaicByEvunpebxo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gfgCplcDGBrovbz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gfgCplcDGBrovbz +.L_small_initial_partial_block_gfgCplcDGBrovbz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + 
vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gfgCplcDGBrovbz: + + orq %r8,%r8 + je .L_after_reduction_gfgCplcDGBrovbz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_gfgCplcDGBrovbz: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_10_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_bfFejorcehrytqq + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_bfFejorcehrytqq + +.L_16_blocks_overflow_bfFejorcehrytqq: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_bfFejorcehrytqq: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ebiAndfrelejgeD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ebiAndfrelejgeD +.L_small_initial_partial_block_ebiAndfrelejgeD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ebiAndfrelejgeD: + + orq %r8,%r8 + je .L_after_reduction_ebiAndfrelejgeD + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ebiAndfrelejgeD: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_11_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_nsakvpcBnizduGq + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_nsakvpcBnizduGq + +.L_16_blocks_overflow_nsakvpcBnizduGq: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_nsakvpcBnizduGq: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 
704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FeAoudrbheqBGiy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + 
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FeAoudrbheqBGiy +.L_small_initial_partial_block_FeAoudrbheqBGiy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FeAoudrbheqBGiy: + + orq %r8,%r8 + je .L_after_reduction_FeAoudrbheqBGiy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FeAoudrbheqBGiy: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_12_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_bwFzciofFgjcilw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_bwFzciofFgjcilw + +.L_16_blocks_overflow_bwFzciofFgjcilw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_bwFzciofFgjcilw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_cfkroClFdpzvhum + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_cfkroClFdpzvhum +.L_small_initial_partial_block_cfkroClFdpzvhum: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_cfkroClFdpzvhum: + + orq %r8,%r8 + je .L_after_reduction_cfkroClFdpzvhum + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_cfkroClFdpzvhum: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_13_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_wabAfqhkitemmDb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_wabAfqhkitemmDb + +.L_16_blocks_overflow_wabAfqhkitemmDb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_wabAfqhkitemmDb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_sdmohCiFjxvtkha + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_sdmohCiFjxvtkha +.L_small_initial_partial_block_sdmohCiFjxvtkha: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_sdmohCiFjxvtkha: + + orq %r8,%r8 + je .L_after_reduction_sdmohCiFjxvtkha + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_sdmohCiFjxvtkha: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_14_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_xpqoqezlFcomfjA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_xpqoqezlFcomfjA + +.L_16_blocks_overflow_xpqoqezlFcomfjA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_xpqoqezlFcomfjA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast 
%ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fexjdoDflollEzw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fexjdoDflollEzw +.L_small_initial_partial_block_fexjdoDflollEzw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fexjdoDflollEzw: + + orq %r8,%r8 + je .L_after_reduction_fexjdoDflollEzw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fexjdoDflollEzw: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_15_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_iupvxgCFjryaArw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_iupvxgCFjryaArw + +.L_16_blocks_overflow_iupvxgCFjryaArw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_iupvxgCFjryaArw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_lxborjzgtwFghrg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq 
$0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_lxborjzgtwFghrg +.L_small_initial_partial_block_lxborjzgtwFghrg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lxborjzgtwFghrg: + + orq %r8,%r8 + je .L_after_reduction_lxborjzgtwFghrg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lxborjzgtwFghrg: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_16_GGlifssooGvFomC: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_moDvkAftCFCxmvo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_moDvkAftCFCxmvo + +.L_16_blocks_overflow_moDvkAftCFCxmvo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + 
vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_moDvkAftCFCxmvo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + 
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_xrrskpkhizncrkw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xrrskpkhizncrkw: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xrrskpkhizncrkw: + jmp .L_last_blocks_done_GGlifssooGvFomC +.L_last_num_blocks_is_0_GGlifssooGvFomC: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq 
$0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_GGlifssooGvFomC: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_yiifChpfBbxhAhe + +.L_message_below_32_blocks_yiifChpfBbxhAhe: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_ixpbnbdqqmnximo + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + 
+ + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_ixpbnbdqqmnximo: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_GEDbrBwahgCtBua + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_GEDbrBwahgCtBua + jb .L_last_num_blocks_is_7_1_GEDbrBwahgCtBua + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_GEDbrBwahgCtBua + jb .L_last_num_blocks_is_11_9_GEDbrBwahgCtBua + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_GEDbrBwahgCtBua + ja .L_last_num_blocks_is_16_GEDbrBwahgCtBua + cmpl $14,%r10d + je .L_last_num_blocks_is_14_GEDbrBwahgCtBua + jmp .L_last_num_blocks_is_13_GEDbrBwahgCtBua + +.L_last_num_blocks_is_11_9_GEDbrBwahgCtBua: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_GEDbrBwahgCtBua + ja .L_last_num_blocks_is_11_GEDbrBwahgCtBua + jmp .L_last_num_blocks_is_9_GEDbrBwahgCtBua + +.L_last_num_blocks_is_7_1_GEDbrBwahgCtBua: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_GEDbrBwahgCtBua + jb .L_last_num_blocks_is_3_1_GEDbrBwahgCtBua + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_GEDbrBwahgCtBua + je .L_last_num_blocks_is_6_GEDbrBwahgCtBua + jmp .L_last_num_blocks_is_5_GEDbrBwahgCtBua + +.L_last_num_blocks_is_3_1_GEDbrBwahgCtBua: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_GEDbrBwahgCtBua + je .L_last_num_blocks_is_2_GEDbrBwahgCtBua +.L_last_num_blocks_is_1_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_uopvqADFnvomDpc + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_uopvqADFnvomDpc + +.L_16_blocks_overflow_uopvqADFnvomDpc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_uopvqADFnvomDpc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DnfzexoyiBDakur + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DnfzexoyiBDakur +.L_small_initial_partial_block_DnfzexoyiBDakur: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_DnfzexoyiBDakur +.L_small_initial_compute_done_DnfzexoyiBDakur: +.L_after_reduction_DnfzexoyiBDakur: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_2_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + 
movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_frftcwjeGlwitcu + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_frftcwjeGlwitcu + +.L_16_blocks_overflow_frftcwjeGlwitcu: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_frftcwjeGlwitcu: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ebldtywbExmpuki + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ebldtywbExmpuki +.L_small_initial_partial_block_ebldtywbExmpuki: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ebldtywbExmpuki: + + orq %r8,%r8 + je .L_after_reduction_ebldtywbExmpuki + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ebldtywbExmpuki: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_3_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_hAiudycBxwjzccs + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_hAiudycBxwjzccs + +.L_16_blocks_overflow_hAiudycBxwjzccs: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_hAiudycBxwjzccs: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gkjuFBcoGtpvwjC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gkjuFBcoGtpvwjC +.L_small_initial_partial_block_gkjuFBcoGtpvwjC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gkjuFBcoGtpvwjC: + + orq %r8,%r8 + je .L_after_reduction_gkjuFBcoGtpvwjC + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_gkjuFBcoGtpvwjC: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_4_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_oahqGxwjdGuFmgl + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_oahqGxwjdGuFmgl + +.L_16_blocks_overflow_oahqGxwjdGuFmgl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb 
%zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_oahqGxwjdGuFmgl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_eiywasarDnqsmGr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_eiywasarDnqsmGr 
+.L_small_initial_partial_block_eiywasarDnqsmGr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_eiywasarDnqsmGr: + + orq %r8,%r8 + je .L_after_reduction_eiywasarDnqsmGr + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_eiywasarDnqsmGr: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_5_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_hnCCvmCdnDGyqwm + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_hnCCvmCdnDGyqwm + +.L_16_blocks_overflow_hnCCvmCdnDGyqwm: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_hnCCvmCdnDGyqwm: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ClsDvmjDyaivejA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ClsDvmjDyaivejA +.L_small_initial_partial_block_ClsDvmjDyaivejA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + 
vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ClsDvmjDyaivejA: + + orq %r8,%r8 + je .L_after_reduction_ClsDvmjDyaivejA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ClsDvmjDyaivejA: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_6_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_wuftgpncuosGzzy + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_wuftgpncuosGzzy + +.L_16_blocks_overflow_wuftgpncuosGzzy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_wuftgpncuosGzzy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 
%zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zFcpqFaCfaxEfGi + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zFcpqFaCfaxEfGi +.L_small_initial_partial_block_zFcpqFaCfaxEfGi: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zFcpqFaCfaxEfGi: + + orq %r8,%r8 + je .L_after_reduction_zFcpqFaCfaxEfGi + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zFcpqFaCfaxEfGi: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_7_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_klwFEoGBGuBizdw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_klwFEoGBGuBizdw + +.L_16_blocks_overflow_klwFEoGBGuBizdw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_klwFEoGBGuBizdw: 
+ + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xbzdhFqEauEAyBq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xbzdhFqEauEAyBq +.L_small_initial_partial_block_xbzdhFqEauEAyBq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xbzdhFqEauEAyBq: + + orq %r8,%r8 + je .L_after_reduction_xbzdhFqEauEAyBq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xbzdhFqEauEAyBq: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_8_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_jAucrepCBmxevpC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_jAucrepCBmxevpC + +.L_16_blocks_overflow_jAucrepCBmxevpC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_jAucrepCBmxevpC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xBnzffrFrcfhxcA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xBnzffrFrcfhxcA 
+.L_small_initial_partial_block_xBnzffrFrcfhxcA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xBnzffrFrcfhxcA: + + orq %r8,%r8 + je .L_after_reduction_xBnzffrFrcfhxcA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xBnzffrFrcfhxcA: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_9_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_lnAxGywxkpnspqj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_lnAxGywxkpnspqj + +.L_16_blocks_overflow_lnAxGywxkpnspqj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_lnAxGywxkpnspqj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + 
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AFvqyugwjoGBwEa + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AFvqyugwjoGBwEa +.L_small_initial_partial_block_AFvqyugwjoGBwEa: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AFvqyugwjoGBwEa: + + orq %r8,%r8 + je .L_after_reduction_AFvqyugwjoGBwEa + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AFvqyugwjoGBwEa: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_10_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_ffDgumCtogFyFDv + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_ffDgumCtogFyFDv + +.L_16_blocks_overflow_ffDgumCtogFyFDv: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_ffDgumCtogFyFDv: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_erArFgBvhusaEfz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_erArFgBvhusaEfz +.L_small_initial_partial_block_erArFgBvhusaEfz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_erArFgBvhusaEfz: + + orq %r8,%r8 + je .L_after_reduction_erArFgBvhusaEfz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_erArFgBvhusaEfz: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_11_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_bFwwBhxumkFGgCj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_bFwwBhxumkFGgCj + +.L_16_blocks_overflow_bFwwBhxumkFGgCj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_bFwwBhxumkFGgCj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GsrdkhxzEjDjspu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq 
$8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GsrdkhxzEjDjspu +.L_small_initial_partial_block_GsrdkhxzEjDjspu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GsrdkhxzEjDjspu: + + orq %r8,%r8 + je .L_after_reduction_GsrdkhxzEjDjspu + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_GsrdkhxzEjDjspu: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_12_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_EhylpkcoptuvDCF + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_EhylpkcoptuvDCF + +.L_16_blocks_overflow_EhylpkcoptuvDCF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_EhylpkcoptuvDCF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rxjldaleyvljAtn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rxjldaleyvljAtn +.L_small_initial_partial_block_rxjldaleyvljAtn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rxjldaleyvljAtn: + + orq %r8,%r8 + je .L_after_reduction_rxjldaleyvljAtn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rxjldaleyvljAtn: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_13_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_fbDDAjuqhDzbgcz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_fbDDAjuqhDzbgcz + +.L_16_blocks_overflow_fbDDAjuqhDzbgcz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 
+ vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_fbDDAjuqhDzbgcz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + 
vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rvBgbcAEiGvppxE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rvBgbcAEiGvppxE +.L_small_initial_partial_block_rvBgbcAEiGvppxE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rvBgbcAEiGvppxE: + + orq %r8,%r8 + je .L_after_reduction_rvBgbcAEiGvppxE + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rvBgbcAEiGvppxE: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_14_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_gqnBxnvCCiecpBb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_gqnBxnvCCiecpBb + +.L_16_blocks_overflow_gqnBxnvCCiecpBb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_gqnBxnvCCiecpBb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_eqvhEpqoCboGBGs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_eqvhEpqoCboGBGs +.L_small_initial_partial_block_eqvhEpqoCboGBGs: + + + + + + + + + movq %r8,(%rdx) + 
vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_eqvhEpqoCboGBGs: + + orq %r8,%r8 + je .L_after_reduction_eqvhEpqoCboGBGs + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_eqvhEpqoCboGBGs: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_15_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_dnxqlgAbmkEzAAl + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_dnxqlgAbmkEzAAl + +.L_16_blocks_overflow_dnxqlgAbmkEzAAl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_dnxqlgAbmkEzAAl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + 
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vubecvzrvvmvkjn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vubecvzrvvmvkjn +.L_small_initial_partial_block_vubecvzrvvmvkjn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vubecvzrvvmvkjn: + + orq %r8,%r8 + je .L_after_reduction_vubecvzrvvmvkjn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vubecvzrvvmvkjn: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_16_GEDbrBwahgCtBua: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 
+ cmpl $240,%r15d + jae .L_16_blocks_overflow_CvkndtfiFrebkyC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_CvkndtfiFrebkyC + +.L_16_blocks_overflow_CvkndtfiFrebkyC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_CvkndtfiFrebkyC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 
176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_lvDgrdjdyCeaixF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_lvDgrdjdyCeaixF: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_lvDgrdjdyCeaixF: + jmp .L_last_blocks_done_GEDbrBwahgCtBua +.L_last_num_blocks_is_0_GEDbrBwahgCtBua: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq 
$0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_GEDbrBwahgCtBua: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_yiifChpfBbxhAhe + +.L_message_below_equal_16_blocks_yiifChpfBbxhAhe: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_mplqBbEupjaGmpE + jl .L_small_initial_num_blocks_is_7_1_mplqBbEupjaGmpE + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_mplqBbEupjaGmpE + jl .L_small_initial_num_blocks_is_11_9_mplqBbEupjaGmpE + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_mplqBbEupjaGmpE + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_mplqBbEupjaGmpE + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_mplqBbEupjaGmpE + jmp .L_small_initial_num_blocks_is_13_mplqBbEupjaGmpE + +.L_small_initial_num_blocks_is_11_9_mplqBbEupjaGmpE: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_mplqBbEupjaGmpE + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_mplqBbEupjaGmpE + jmp .L_small_initial_num_blocks_is_9_mplqBbEupjaGmpE + +.L_small_initial_num_blocks_is_7_1_mplqBbEupjaGmpE: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_mplqBbEupjaGmpE + jl .L_small_initial_num_blocks_is_3_1_mplqBbEupjaGmpE + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_mplqBbEupjaGmpE + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_mplqBbEupjaGmpE + jmp .L_small_initial_num_blocks_is_5_mplqBbEupjaGmpE + +.L_small_initial_num_blocks_is_3_1_mplqBbEupjaGmpE: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_mplqBbEupjaGmpE + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_mplqBbEupjaGmpE + + + + + +.L_small_initial_num_blocks_is_1_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 
96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm6,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_nsFdAskshxaeupv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_nsFdAskshxaeupv +.L_small_initial_partial_block_nsFdAskshxaeupv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_nsFdAskshxaeupv +.L_small_initial_compute_done_nsFdAskshxaeupv: +.L_after_reduction_nsFdAskshxaeupv: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_2_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm6,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fCBepgtpwtinebu + + + + + + subq $16,%r8 + movq $0,(%rdx) + 
vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fCBepgtpwtinebu +.L_small_initial_partial_block_fCBepgtpwtinebu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fCBepgtpwtinebu: + + orq %r8,%r8 + je .L_after_reduction_fCBepgtpwtinebu + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_fCBepgtpwtinebu: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_3_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 
1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ofgdrgACzgoBoBr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ofgdrgACzgoBoBr +.L_small_initial_partial_block_ofgdrgACzgoBoBr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ofgdrgACzgoBoBr: + + orq %r8,%r8 + je .L_after_reduction_ofgdrgACzgoBoBr + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ofgdrgACzgoBoBr: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_4_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + 
movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dEtigFagnjrsGpg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dEtigFagnjrsGpg +.L_small_initial_partial_block_dEtigFagnjrsGpg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dEtigFagnjrsGpg: + + orq %r8,%r8 + je .L_after_reduction_dEtigFagnjrsGpg + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_dEtigFagnjrsGpg: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_5_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %xmm29,%xmm7,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dCteGnCoiDfemGr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dCteGnCoiDfemGr +.L_small_initial_partial_block_dCteGnCoiDfemGr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dCteGnCoiDfemGr: + + orq %r8,%r8 + je .L_after_reduction_dCteGnCoiDfemGr + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_dCteGnCoiDfemGr: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE 
+.L_small_initial_num_blocks_is_6_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %ymm29,%ymm7,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bGkgeCcdmBAvnkd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bGkgeCcdmBAvnkd +.L_small_initial_partial_block_bGkgeCcdmBAvnkd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + 
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bGkgeCcdmBAvnkd: + + orq %r8,%r8 + je .L_after_reduction_bGkgeCcdmBAvnkd + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_bGkgeCcdmBAvnkd: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_7_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yFpypBfpEqGmDpc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + 
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yFpypBfpEqGmDpc +.L_small_initial_partial_block_yFpypBfpEqGmDpc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yFpypBfpEqGmDpc: + + orq %r8,%r8 + je .L_after_reduction_yFpypBfpEqGmDpc + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_yFpypBfpEqGmDpc: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_8_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + 
vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hjijhggGtBGkmFD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hjijhggGtBGkmFD +.L_small_initial_partial_block_hjijhggGtBGkmFD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq 
$0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hjijhggGtBGkmFD: + + orq %r8,%r8 + je .L_after_reduction_hjijhggGtBGkmFD + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_hjijhggGtBGkmFD: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_9_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %xmm29,%xmm10,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rEnEygbAhbwkuDv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + 
vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rEnEygbAhbwkuDv +.L_small_initial_partial_block_rEnEygbAhbwkuDv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rEnEygbAhbwkuDv: + + orq %r8,%r8 + je .L_after_reduction_rEnEygbAhbwkuDv + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_rEnEygbAhbwkuDv: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_10_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %ymm29,%ymm10,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ycofttvCgGxDvfA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ycofttvCgGxDvfA +.L_small_initial_partial_block_ycofttvCgGxDvfA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + 
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ycofttvCgGxDvfA: + + orq %r8,%r8 + je .L_after_reduction_ycofttvCgGxDvfA + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ycofttvCgGxDvfA: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_11_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + 
vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ltkvxnnCtyaDcot + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ltkvxnnCtyaDcot +.L_small_initial_partial_block_ltkvxnnCtyaDcot: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq 
%ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ltkvxnnCtyaDcot: + + orq %r8,%r8 + je .L_after_reduction_ltkvxnnCtyaDcot + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ltkvxnnCtyaDcot: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_12_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zoBxutsDfgEkfdl + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq 
$0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zoBxutsDfgEkfdl +.L_small_initial_partial_block_zoBxutsDfgEkfdl: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zoBxutsDfgEkfdl: + + orq %r8,%r8 + je .L_after_reduction_zoBxutsDfgEkfdl + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_zoBxutsDfgEkfdl: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_13_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + 
vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %xmm29,%xmm11,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fgsEocrdhfxmzmp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + 
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fgsEocrdhfxmzmp +.L_small_initial_partial_block_fgsEocrdhfxmzmp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fgsEocrdhfxmzmp: + + orq %r8,%r8 + je .L_after_reduction_fgsEocrdhfxmzmp + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_fgsEocrdhfxmzmp: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_14_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq 
%ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %ymm29,%ymm11,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_aBllprqbyydDmyj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq 
$8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_aBllprqbyydDmyj +.L_small_initial_partial_block_aBllprqbyydDmyj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_aBllprqbyydDmyj: + + orq %r8,%r8 + je .L_after_reduction_aBllprqbyydDmyj + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_aBllprqbyydDmyj: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_15_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 
16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AexewybgiAbCusw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + 
vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AexewybgiAbCusw +.L_small_initial_partial_block_AexewybgiAbCusw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AexewybgiAbCusw: + + orq %r8,%r8 + je .L_after_reduction_AexewybgiAbCusw + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_AexewybgiAbCusw: + jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE +.L_small_initial_num_blocks_is_16_mplqBbEupjaGmpE: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 
16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_wjciopnfEgwwghE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + 
vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wjciopnfEgwwghE: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_wjciopnfEgwwghE: +.L_small_initial_blocks_encrypted_mplqBbEupjaGmpE: +.L_ghash_done_yiifChpfBbxhAhe: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_yiifChpfBbxhAhe: + jmp .Lexit_gcm_decrypt +.align 32 +.Laes_gcm_decrypt_256_avx512: + orq %r8,%r8 + je .L_enc_dec_done_kgypzeldFqsBnqw + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_nggFpEjksmvdyrl + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + + vmovdqa64 %xmm0,%xmm6 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_nggFpEjksmvdyrl + subq %r13,%r12 +.L_no_extra_mask_nggFpEjksmvdyrl: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpand %xmm0,%xmm6,%xmm6 + vpshufb SHUF_MASK(%rip),%xmm6,%xmm6 + vpshufb %xmm5,%xmm6,%xmm6 + vpxorq %xmm6,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_nggFpEjksmvdyrl + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_nggFpEjksmvdyrl + +.L_partial_incomplete_nggFpEjksmvdyrl: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_nggFpEjksmvdyrl: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_nggFpEjksmvdyrl: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_kgypzeldFqsBnqw + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_kgypzeldFqsBnqw + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_tAigrohrtcimtjt + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_tAigrohrtcimtjt +.L_next_16_overflow_tAigrohrtcimtjt: + 
vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_tAigrohrtcimtjt: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_ghxCyjhEqsFobgk + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 
$0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_ghxCyjhEqsFobgk: + cmpq $512,%r8 + jb .L_message_below_32_blocks_kgypzeldFqsBnqw + + + + cmpb $240,%r15b + jae .L_next_16_overflow_ChqoygvwrfptFdk + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_ChqoygvwrfptFdk +.L_next_16_overflow_ChqoygvwrfptFdk: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_ChqoygvwrfptFdk: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + 
vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_mmnytfEfrGqjjzv + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq 
$0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_mmnytfEfrGqjjzv: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_kgypzeldFqsBnqw +.L_encrypt_big_nblocks_kgypzeldFqsBnqw: + cmpb $240,%r15b + jae .L_16_blocks_overflow_eCBAbsCxcdjldmp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_eCBAbsCxcdjldmp +.L_16_blocks_overflow_eCBAbsCxcdjldmp: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_eCBAbsCxcdjldmp: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + 
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_vakicEdockyEGlr + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_vakicEdockyEGlr 
+.L_16_blocks_overflow_vakicEdockyEGlr: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_vakicEdockyEGlr: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + 
+ vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_DpGlguFoEuofxlo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_DpGlguFoEuofxlo +.L_16_blocks_overflow_DpGlguFoEuofxlo: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_DpGlguFoEuofxlo: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + 
vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_kgypzeldFqsBnqw + +.L_no_more_big_nblocks_kgypzeldFqsBnqw: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_kgypzeldFqsBnqw + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_kgypzeldFqsBnqw +.L_encrypt_0_blocks_ghash_32_kgypzeldFqsBnqw: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq 
$0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_ClvEnqtsgcyzxra + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_ClvEnqtsgcyzxra + jb .L_last_num_blocks_is_7_1_ClvEnqtsgcyzxra + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_ClvEnqtsgcyzxra + jb .L_last_num_blocks_is_11_9_ClvEnqtsgcyzxra + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_ClvEnqtsgcyzxra + ja .L_last_num_blocks_is_16_ClvEnqtsgcyzxra + cmpl $14,%r10d + je .L_last_num_blocks_is_14_ClvEnqtsgcyzxra + jmp .L_last_num_blocks_is_13_ClvEnqtsgcyzxra + +.L_last_num_blocks_is_11_9_ClvEnqtsgcyzxra: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_ClvEnqtsgcyzxra + ja .L_last_num_blocks_is_11_ClvEnqtsgcyzxra + jmp .L_last_num_blocks_is_9_ClvEnqtsgcyzxra + +.L_last_num_blocks_is_7_1_ClvEnqtsgcyzxra: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_ClvEnqtsgcyzxra + jb .L_last_num_blocks_is_3_1_ClvEnqtsgcyzxra + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_ClvEnqtsgcyzxra + je .L_last_num_blocks_is_6_ClvEnqtsgcyzxra + jmp .L_last_num_blocks_is_5_ClvEnqtsgcyzxra + +.L_last_num_blocks_is_3_1_ClvEnqtsgcyzxra: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_ClvEnqtsgcyzxra + je .L_last_num_blocks_is_2_ClvEnqtsgcyzxra +.L_last_num_blocks_is_1_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_kfstzqbddCmrAgf + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_kfstzqbddCmrAgf + +.L_16_blocks_overflow_kfstzqbddCmrAgf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_kfstzqbddCmrAgf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + 
vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tzfDxgvlfbGFphv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tzfDxgvlfbGFphv +.L_small_initial_partial_block_tzfDxgvlfbGFphv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq 
%xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_tzfDxgvlfbGFphv +.L_small_initial_compute_done_tzfDxgvlfbGFphv: +.L_after_reduction_tzfDxgvlfbGFphv: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_2_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_rEDkqlsspBphEcE + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_rEDkqlsspBphEcE + +.L_16_blocks_overflow_rEDkqlsspBphEcE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_rEDkqlsspBphEcE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ctfxgFaGttixvxc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + 
vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ctfxgFaGttixvxc +.L_small_initial_partial_block_ctfxgFaGttixvxc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ctfxgFaGttixvxc: + + orq %r8,%r8 + je .L_after_reduction_ctfxgFaGttixvxc + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ctfxgFaGttixvxc: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_3_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_ghEEltEpFsCnyoi + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_ghEEltEpFsCnyoi + +.L_16_blocks_overflow_ghEEltEpFsCnyoi: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_ghEEltEpFsCnyoi: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + 
vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pdGCGzyrnusufbk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pdGCGzyrnusufbk +.L_small_initial_partial_block_pdGCGzyrnusufbk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_pdGCGzyrnusufbk: + + orq %r8,%r8 + je .L_after_reduction_pdGCGzyrnusufbk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pdGCGzyrnusufbk: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_4_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_vrGynyzBBkFtoug + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_vrGynyzBBkFtoug + +.L_16_blocks_overflow_vrGynyzBBkFtoug: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_vrGynyzBBkFtoug: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vbpuzolxwysglov + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + 
+ vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vbpuzolxwysglov +.L_small_initial_partial_block_vbpuzolxwysglov: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_vbpuzolxwysglov: + + orq %r8,%r8 + je .L_after_reduction_vbpuzolxwysglov + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_vbpuzolxwysglov: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_5_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_kkiaoGfqlrecpbg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_kkiaoGfqlrecpbg + +.L_16_blocks_overflow_kkiaoGfqlrecpbg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_kkiaoGfqlrecpbg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + 
vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ephjiBFojtbqzgd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ephjiBFojtbqzgd +.L_small_initial_partial_block_ephjiBFojtbqzgd: + + + + + + + + + movq %r8,(%rdx) 
+ vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ephjiBFojtbqzgd: + + orq %r8,%r8 + je .L_after_reduction_ephjiBFojtbqzgd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ephjiBFojtbqzgd: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_6_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_BGjhpBrnvbegsga + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_BGjhpBrnvbegsga + +.L_16_blocks_overflow_BGjhpBrnvbegsga: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_BGjhpBrnvbegsga: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fcljjovquiEbomB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fcljjovquiEbomB +.L_small_initial_partial_block_fcljjovquiEbomB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fcljjovquiEbomB: + + orq %r8,%r8 + je .L_after_reduction_fcljjovquiEbomB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fcljjovquiEbomB: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_7_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_izrwrwtizdFmmop + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_izrwrwtizdFmmop + +.L_16_blocks_overflow_izrwrwtizdFmmop: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_izrwrwtizdFmmop: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BGxuGiljxiGuGwj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BGxuGiljxiGuGwj +.L_small_initial_partial_block_BGxuGiljxiGuGwj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BGxuGiljxiGuGwj: + + orq %r8,%r8 + je .L_after_reduction_BGxuGiljxiGuGwj + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_BGxuGiljxiGuGwj: + jmp 
.L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_8_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_uokAwEtutqrxEoF + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_uokAwEtutqrxEoF + +.L_16_blocks_overflow_uokAwEtutqrxEoF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_uokAwEtutqrxEoF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 
%zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CannrFuxFceaxhk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CannrFuxFceaxhk +.L_small_initial_partial_block_CannrFuxFceaxhk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CannrFuxFceaxhk: + + orq %r8,%r8 + je .L_after_reduction_CannrFuxFceaxhk + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CannrFuxFceaxhk: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_9_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_ydCuzccyysxjEtE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_ydCuzccyysxjEtE + +.L_16_blocks_overflow_ydCuzccyysxjEtE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_ydCuzccyysxjEtE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + 
movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hlxwfcoEeochjmF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hlxwfcoEeochjmF +.L_small_initial_partial_block_hlxwfcoEeochjmF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hlxwfcoEeochjmF: + + orq %r8,%r8 + je .L_after_reduction_hlxwfcoEeochjmF + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hlxwfcoEeochjmF: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_10_ClvEnqtsgcyzxra: + leaq 
byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_uhxcibFtDluhCCB + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_uhxcibFtDluhCCB + +.L_16_blocks_overflow_uhxcibFtDluhCCB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_uhxcibFtDluhCCB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uwCCphGGeEaqtbf + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uwCCphGGeEaqtbf +.L_small_initial_partial_block_uwCCphGGeEaqtbf: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uwCCphGGeEaqtbf: + + orq %r8,%r8 + je .L_after_reduction_uwCCphGGeEaqtbf + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uwCCphGGeEaqtbf: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_11_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_ndAbfmoGyFeFtFs + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_ndAbfmoGyFeFtFs + +.L_16_blocks_overflow_ndAbfmoGyFeFtFs: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_ndAbfmoGyFeFtFs: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq 
$0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tojfqqaoGtkzuaq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tojfqqaoGtkzuaq +.L_small_initial_partial_block_tojfqqaoGtkzuaq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq 
%zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tojfqqaoGtkzuaq: + + orq %r8,%r8 + je .L_after_reduction_tojfqqaoGtkzuaq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tojfqqaoGtkzuaq: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_12_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_rwelfyvzphiDsjE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_rwelfyvzphiDsjE + +.L_16_blocks_overflow_rwelfyvzphiDsjE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_rwelfyvzphiDsjE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + 
vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_CzrAuaBADCucxbj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_CzrAuaBADCucxbj +.L_small_initial_partial_block_CzrAuaBADCucxbj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_CzrAuaBADCucxbj: + + orq %r8,%r8 + je .L_after_reduction_CzrAuaBADCucxbj + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_CzrAuaBADCucxbj: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_13_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_aizclGCjAeGBapi + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_aizclGCjAeGBapi + +.L_16_blocks_overflow_aizclGCjAeGBapi: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_aizclGCjAeGBapi: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + 
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_rsvakfaFrrcdnmn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_rsvakfaFrrcdnmn +.L_small_initial_partial_block_rsvakfaFrrcdnmn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_rsvakfaFrrcdnmn: + + orq %r8,%r8 + je .L_after_reduction_rsvakfaFrrcdnmn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_rsvakfaFrrcdnmn: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_14_ClvEnqtsgcyzxra: + leaq 
byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_CifFuwhmDnsajva + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_CifFuwhmDnsajva + +.L_16_blocks_overflow_CifFuwhmDnsajva: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_CifFuwhmDnsajva: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_eAqADtqcmpkizGe + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_eAqADtqcmpkizGe +.L_small_initial_partial_block_eAqADtqcmpkizGe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_eAqADtqcmpkizGe: + + orq %r8,%r8 + je .L_after_reduction_eAqADtqcmpkizGe + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_eAqADtqcmpkizGe: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_15_ClvEnqtsgcyzxra: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_oiyvxmCxqthGqom + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_oiyvxmCxqthGqom + +.L_16_blocks_overflow_oiyvxmCxqthGqom: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_oiyvxmCxqthGqom: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc 
%zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ugFbqvmchjEBBBz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + 
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ugFbqvmchjEBBBz +.L_small_initial_partial_block_ugFbqvmchjEBBBz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ugFbqvmchjEBBBz: + + orq %r8,%r8 + je .L_after_reduction_ugFbqvmchjEBBBz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ugFbqvmchjEBBBz: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_16_ClvEnqtsgcyzxra: + leaq 
byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_wCdnfleczoFcEzf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_wCdnfleczoFcEzf + +.L_16_blocks_overflow_wCdnfleczoFcEzf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_wCdnfleczoFcEzf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_qkhBhqDFAyxsceq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qkhBhqDFAyxsceq: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_qkhBhqDFAyxsceq: + jmp .L_last_blocks_done_ClvEnqtsgcyzxra +.L_last_num_blocks_is_0_ClvEnqtsgcyzxra: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + 
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_ClvEnqtsgcyzxra: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_kgypzeldFqsBnqw +.L_encrypt_32_blocks_kgypzeldFqsBnqw: + cmpb $240,%r15b + jae .L_16_blocks_overflow_vGiehzfobkckAyi + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_vGiehzfobkckAyi +.L_16_blocks_overflow_vGiehzfobkckAyi: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_vGiehzfobkckAyi: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc 
%zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_aBfhhtmiojjovim + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_aBfhhtmiojjovim +.L_16_blocks_overflow_aBfhhtmiojjovim: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb 
%zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_aBfhhtmiojjovim: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_AwFklinDrcbFgzn + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_AwFklinDrcbFgzn + jb .L_last_num_blocks_is_7_1_AwFklinDrcbFgzn + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_AwFklinDrcbFgzn + jb .L_last_num_blocks_is_11_9_AwFklinDrcbFgzn + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_AwFklinDrcbFgzn + ja .L_last_num_blocks_is_16_AwFklinDrcbFgzn + cmpl $14,%r10d + je .L_last_num_blocks_is_14_AwFklinDrcbFgzn + jmp .L_last_num_blocks_is_13_AwFklinDrcbFgzn + +.L_last_num_blocks_is_11_9_AwFklinDrcbFgzn: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_AwFklinDrcbFgzn + ja .L_last_num_blocks_is_11_AwFklinDrcbFgzn + jmp .L_last_num_blocks_is_9_AwFklinDrcbFgzn + 
+.L_last_num_blocks_is_7_1_AwFklinDrcbFgzn: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_AwFklinDrcbFgzn + jb .L_last_num_blocks_is_3_1_AwFklinDrcbFgzn + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_AwFklinDrcbFgzn + je .L_last_num_blocks_is_6_AwFklinDrcbFgzn + jmp .L_last_num_blocks_is_5_AwFklinDrcbFgzn + +.L_last_num_blocks_is_3_1_AwFklinDrcbFgzn: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_AwFklinDrcbFgzn + je .L_last_num_blocks_is_2_AwFklinDrcbFgzn +.L_last_num_blocks_is_1_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_FvFeevCgruEuomy + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_FvFeevCgruEuomy + +.L_16_blocks_overflow_FvFeevCgruEuomy: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_FvFeevCgruEuomy: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_vocdDxlyexcAqgk + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq 
$0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_vocdDxlyexcAqgk +.L_small_initial_partial_block_vocdDxlyexcAqgk: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_vocdDxlyexcAqgk +.L_small_initial_compute_done_vocdDxlyexcAqgk: +.L_after_reduction_vocdDxlyexcAqgk: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_2_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_rufCyEuzhyCcBum + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_rufCyEuzhyCcBum + +.L_16_blocks_overflow_rufCyEuzhyCcBum: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_rufCyEuzhyCcBum: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq 
$0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hFhwFAnywtirqFm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hFhwFAnywtirqFm +.L_small_initial_partial_block_hFhwFAnywtirqFm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hFhwFAnywtirqFm: + + orq %r8,%r8 + je .L_after_reduction_hFhwFAnywtirqFm + vpxorq %xmm7,%xmm14,%xmm14 
+.L_after_reduction_hFhwFAnywtirqFm: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_3_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_oiFAsBBekBeEcll + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_oiFAsBBekBeEcll + +.L_16_blocks_overflow_oiFAsBBekBeEcll: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_oiFAsBBekBeEcll: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DakDxmbzhjsFccp + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DakDxmbzhjsFccp +.L_small_initial_partial_block_DakDxmbzhjsFccp: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DakDxmbzhjsFccp: + + orq %r8,%r8 + je .L_after_reduction_DakDxmbzhjsFccp + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DakDxmbzhjsFccp: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_4_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_EeBjyjCzBemkiyn + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_EeBjyjCzBemkiyn + +.L_16_blocks_overflow_EeBjyjCzBemkiyn: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_EeBjyjCzBemkiyn: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq 
$0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pkDoGcykctqxwtv + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pkDoGcykctqxwtv +.L_small_initial_partial_block_pkDoGcykctqxwtv: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pkDoGcykctqxwtv: + + orq %r8,%r8 + je .L_after_reduction_pkDoGcykctqxwtv + vpxorq 
%xmm7,%xmm14,%xmm14 +.L_after_reduction_pkDoGcykctqxwtv: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_5_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_ygonEcumvGgxonp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_ygonEcumvGgxonp + +.L_16_blocks_overflow_ygonEcumvGgxonp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_ygonEcumvGgxonp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_FBDnovehzAhxoFz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_FBDnovehzAhxoFz +.L_small_initial_partial_block_FBDnovehzAhxoFz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_FBDnovehzAhxoFz: + + orq %r8,%r8 + je .L_after_reduction_FBDnovehzAhxoFz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_FBDnovehzAhxoFz: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_6_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_zAwamddcsGuDbsw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_zAwamddcsGuDbsw + +.L_16_blocks_overflow_zAwamddcsGuDbsw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_zAwamddcsGuDbsw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_nBiEFoifDnlnCnA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq 
$8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_nBiEFoifDnlnCnA +.L_small_initial_partial_block_nBiEFoifDnlnCnA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_nBiEFoifDnlnCnA: + + orq %r8,%r8 + je .L_after_reduction_nBiEFoifDnlnCnA + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_nBiEFoifDnlnCnA: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_7_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_pwBmqBGFfnBFiBx + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_pwBmqBGFfnBFiBx + +.L_16_blocks_overflow_pwBmqBGFfnBFiBx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_pwBmqBGFfnBFiBx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wChogqeEderiszq + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wChogqeEderiszq +.L_small_initial_partial_block_wChogqeEderiszq: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wChogqeEderiszq: + + orq %r8,%r8 + je .L_after_reduction_wChogqeEderiszq + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wChogqeEderiszq: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_8_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_xgcteGoksvqdvwC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_xgcteGoksvqdvwC + +.L_16_blocks_overflow_xgcteGoksvqdvwC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_xgcteGoksvqdvwC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bwfvAfrqwqvnlGG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bwfvAfrqwqvnlGG +.L_small_initial_partial_block_bwfvAfrqwqvnlGG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq 
$0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bwfvAfrqwqvnlGG: + + orq %r8,%r8 + je .L_after_reduction_bwfvAfrqwqvnlGG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bwfvAfrqwqvnlGG: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_9_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_nGFogvFjmdjnsvt + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_nGFogvFjmdjnsvt + +.L_16_blocks_overflow_nGFogvFjmdjnsvt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_nGFogvFjmdjnsvt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + 
vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pkinwzuhxhaEgCa + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pkinwzuhxhaEgCa +.L_small_initial_partial_block_pkinwzuhxhaEgCa: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 
224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pkinwzuhxhaEgCa: + + orq %r8,%r8 + je .L_after_reduction_pkinwzuhxhaEgCa + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_pkinwzuhxhaEgCa: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_10_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_ryszgunyrqgvyfB + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_ryszgunyrqgvyfB + +.L_16_blocks_overflow_ryszgunyrqgvyfB: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_ryszgunyrqgvyfB: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq 
$0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jypDCauhjquEuyb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jypDCauhjquEuyb +.L_small_initial_partial_block_jypDCauhjquEuyb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jypDCauhjquEuyb: + + orq %r8,%r8 + je .L_after_reduction_jypDCauhjquEuyb + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jypDCauhjquEuyb: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_11_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_DvudExkamyfuGdv + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_DvudExkamyfuGdv + +.L_16_blocks_overflow_DvudExkamyfuGdv: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_DvudExkamyfuGdv: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_dlfpdlkfExhwjDu + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq 
$0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_dlfpdlkfExhwjDu +.L_small_initial_partial_block_dlfpdlkfExhwjDu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_dlfpdlkfExhwjDu: + + orq %r8,%r8 + je .L_after_reduction_dlfpdlkfExhwjDu + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_dlfpdlkfExhwjDu: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_12_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_pycvwiovDfFylBw + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_pycvwiovDfFylBw + +.L_16_blocks_overflow_pycvwiovDfFylBw: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_pycvwiovDfFylBw: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + 
+ + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DazlrGdgfFiEaoe + + + + + + subq $16,%r8 + 
movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DazlrGdgfFiEaoe +.L_small_initial_partial_block_DazlrGdgfFiEaoe: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DazlrGdgfFiEaoe: + + orq %r8,%r8 + je .L_after_reduction_DazlrGdgfFiEaoe + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DazlrGdgfFiEaoe: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_13_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae 
.L_16_blocks_overflow_sFwEGaAnGxDowcc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_sFwEGaAnGxDowcc + +.L_16_blocks_overflow_sFwEGaAnGxDowcc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_sFwEGaAnGxDowcc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_tohyxsArdntzjGo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_tohyxsArdntzjGo +.L_small_initial_partial_block_tohyxsArdntzjGo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 
288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_tohyxsArdntzjGo: + + orq %r8,%r8 + je .L_after_reduction_tohyxsArdntzjGo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_tohyxsArdntzjGo: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_14_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_fapGrcjmuhklgzo + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_fapGrcjmuhklgzo + +.L_16_blocks_overflow_fapGrcjmuhklgzo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_fapGrcjmuhklgzo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BeFutuwFnozaige + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq 
$0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BeFutuwFnozaige +.L_small_initial_partial_block_BeFutuwFnozaige: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BeFutuwFnozaige: + + orq %r8,%r8 + je .L_after_reduction_BeFutuwFnozaige + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_BeFutuwFnozaige: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_15_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_aByDeEDFBCjvqGx + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_aByDeEDFBCjvqGx + +.L_16_blocks_overflow_aByDeEDFBCjvqGx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd 
%zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_aByDeEDFBCjvqGx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hAxtmivtdwAsvmz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hAxtmivtdwAsvmz +.L_small_initial_partial_block_hAxtmivtdwAsvmz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + 
vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hAxtmivtdwAsvmz: + + orq %r8,%r8 + je .L_after_reduction_hAxtmivtdwAsvmz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hAxtmivtdwAsvmz: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_16_AwFklinDrcbFgzn: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_BwrcaiuzmxchdBE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_BwrcaiuzmxchdBE + +.L_16_blocks_overflow_BwrcaiuzmxchdBE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_BwrcaiuzmxchdBE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq 
$0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_xniaaigktwmycDh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + 
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xniaaigktwmycDh: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xniaaigktwmycDh: + jmp .L_last_blocks_done_AwFklinDrcbFgzn +.L_last_num_blocks_is_0_AwFklinDrcbFgzn: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_AwFklinDrcbFgzn: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_kgypzeldFqsBnqw +.L_encrypt_16_blocks_kgypzeldFqsBnqw: + cmpb $240,%r15b + jae .L_16_blocks_overflow_itlreegehzzFvho + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_itlreegehzzFvho +.L_16_blocks_overflow_itlreegehzzFvho: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd 
%zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_itlreegehzzFvho: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + 
vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_xAfbdFbjfoyBlDz + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_xAfbdFbjfoyBlDz + jb .L_last_num_blocks_is_7_1_xAfbdFbjfoyBlDz + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_xAfbdFbjfoyBlDz + jb .L_last_num_blocks_is_11_9_xAfbdFbjfoyBlDz + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_xAfbdFbjfoyBlDz + ja .L_last_num_blocks_is_16_xAfbdFbjfoyBlDz + cmpl $14,%r10d + je .L_last_num_blocks_is_14_xAfbdFbjfoyBlDz + jmp .L_last_num_blocks_is_13_xAfbdFbjfoyBlDz + +.L_last_num_blocks_is_11_9_xAfbdFbjfoyBlDz: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_xAfbdFbjfoyBlDz + ja .L_last_num_blocks_is_11_xAfbdFbjfoyBlDz + jmp .L_last_num_blocks_is_9_xAfbdFbjfoyBlDz + +.L_last_num_blocks_is_7_1_xAfbdFbjfoyBlDz: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_xAfbdFbjfoyBlDz + jb .L_last_num_blocks_is_3_1_xAfbdFbjfoyBlDz + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_xAfbdFbjfoyBlDz + je .L_last_num_blocks_is_6_xAfbdFbjfoyBlDz + jmp .L_last_num_blocks_is_5_xAfbdFbjfoyBlDz + +.L_last_num_blocks_is_3_1_xAfbdFbjfoyBlDz: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_xAfbdFbjfoyBlDz + je .L_last_num_blocks_is_2_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_1_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_lapolqbccExufla + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_lapolqbccExufla + +.L_16_blocks_overflow_lapolqbccExufla: + vpshufb 
%zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_lapolqbccExufla: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_aksayyCEvBwkqCs + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + 
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_aksayyCEvBwkqCs +.L_small_initial_partial_block_aksayyCEvBwkqCs: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_aksayyCEvBwkqCs +.L_small_initial_compute_done_aksayyCEvBwkqCs: +.L_after_reduction_aksayyCEvBwkqCs: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_2_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_EnCCsEpwCxDywbA + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_EnCCsEpwCxDywbA + +.L_16_blocks_overflow_EnCCsEpwCxDywbA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_EnCCsEpwCxDywbA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 
+ vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_enwlcwbgseiBryB + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_enwlcwbgseiBryB +.L_small_initial_partial_block_enwlcwbgseiBryB: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_enwlcwbgseiBryB: + + orq %r8,%r8 + je .L_after_reduction_enwlcwbgseiBryB + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_enwlcwbgseiBryB: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_3_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae 
.L_16_blocks_overflow_bEsbraEgeohwpzz + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_bEsbraEgeohwpzz + +.L_16_blocks_overflow_bEsbraEgeohwpzz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_bEsbraEgeohwpzz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_jrkEfawFjAdFFAw + + + + + + subq $16,%r8 + movq $0,(%rdx) + 
vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_jrkEfawFjAdFFAw +.L_small_initial_partial_block_jrkEfawFjAdFFAw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_jrkEfawFjAdFFAw: + + orq %r8,%r8 + je .L_after_reduction_jrkEfawFjAdFFAw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_jrkEfawFjAdFFAw: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_4_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_jxvxvtaszlAuveu + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_jxvxvtaszlAuveu + +.L_16_blocks_overflow_jxvxvtaszlAuveu: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_jxvxvtaszlAuveu: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_BoECtwduirkpGbd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_BoECtwduirkpGbd +.L_small_initial_partial_block_BoECtwduirkpGbd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_BoECtwduirkpGbd: + + orq %r8,%r8 + je .L_after_reduction_BoECtwduirkpGbd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_BoECtwduirkpGbd: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_5_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_AemnsnzilvGaDvl + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_AemnsnzilvGaDvl + +.L_16_blocks_overflow_AemnsnzilvGaDvl: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_AemnsnzilvGaDvl: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq 
$0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_AChbnzckEtGqvia + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_AChbnzckEtGqvia +.L_small_initial_partial_block_AChbnzckEtGqvia: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + 
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AChbnzckEtGqvia: + + orq %r8,%r8 + je .L_after_reduction_AChbnzckEtGqvia + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AChbnzckEtGqvia: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_6_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_pGnpmuquowsenAC + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_pGnpmuquowsenAC + +.L_16_blocks_overflow_pGnpmuquowsenAC: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_pGnpmuquowsenAC: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_kcatvpdGCtefzAw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_kcatvpdGCtefzAw +.L_small_initial_partial_block_kcatvpdGCtefzAw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_kcatvpdGCtefzAw: + + orq %r8,%r8 + je .L_after_reduction_kcatvpdGCtefzAw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_kcatvpdGCtefzAw: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_7_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_vBcFztzloamdDFg + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_vBcFztzloamdDFg + +.L_16_blocks_overflow_vBcFztzloamdDFg: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_vBcFztzloamdDFg: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq 
$8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yfFcsqkvhbddwyy + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yfFcsqkvhbddwyy +.L_small_initial_partial_block_yfFcsqkvhbddwyy: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yfFcsqkvhbddwyy: + + orq %r8,%r8 + je .L_after_reduction_yfFcsqkvhbddwyy + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_yfFcsqkvhbddwyy: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_8_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_FdAnkzzirEtjwrb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_FdAnkzzirEtjwrb + +.L_16_blocks_overflow_FdAnkzzirEtjwrb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_FdAnkzzirEtjwrb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + 
vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wvyqkgDlqezddls + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wvyqkgDlqezddls +.L_small_initial_partial_block_wvyqkgDlqezddls: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq 
%zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wvyqkgDlqezddls: + + orq %r8,%r8 + je .L_after_reduction_wvyqkgDlqezddls + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wvyqkgDlqezddls: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_9_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_lhtDngmdlssnvDG + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_lhtDngmdlssnvDG + +.L_16_blocks_overflow_lhtDngmdlssnvDG: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_lhtDngmdlssnvDG: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ptjDGBmufbAkAGG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + 
vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ptjDGBmufbAkAGG +.L_small_initial_partial_block_ptjDGBmufbAkAGG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ptjDGBmufbAkAGG: + + orq %r8,%r8 + je .L_after_reduction_ptjDGBmufbAkAGG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ptjDGBmufbAkAGG: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_10_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_wsaFiGmrqxypimt + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_wsaFiGmrqxypimt + +.L_16_blocks_overflow_wsaFiGmrqxypimt: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_wsaFiGmrqxypimt: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gnctxlhtglgbgvx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq 
$0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gnctxlhtglgbgvx +.L_small_initial_partial_block_gnctxlhtglgbgvx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gnctxlhtglgbgvx: + + orq %r8,%r8 + je .L_after_reduction_gnctxlhtglgbgvx + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_gnctxlhtglgbgvx: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_11_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_neydhuxthowjDfe + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_neydhuxthowjDfe + 
+.L_16_blocks_overflow_neydhuxthowjDfe: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_neydhuxthowjDfe: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + 
vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_btfsxwwBfubFEhw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_btfsxwwBfubFEhw +.L_small_initial_partial_block_btfsxwwBfubFEhw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_btfsxwwBfubFEhw: + + orq %r8,%r8 + je .L_after_reduction_btfsxwwBfubFEhw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_btfsxwwBfubFEhw: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_12_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_vmmvFmFAAqpDrjc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_vmmvFmFAAqpDrjc + +.L_16_blocks_overflow_vmmvFmFAAqpDrjc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_vmmvFmFAAqpDrjc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_muxxrlxFvpCuucj + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + 
vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_muxxrlxFvpCuucj +.L_small_initial_partial_block_muxxrlxFvpCuucj: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_muxxrlxFvpCuucj: + + orq %r8,%r8 + je .L_after_reduction_muxxrlxFvpCuucj + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_muxxrlxFvpCuucj: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_13_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_BtCEtGboibyzmkz + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_BtCEtGboibyzmkz + +.L_16_blocks_overflow_BtCEtGboibyzmkz: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_BtCEtGboibyzmkz: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 
640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq 
$0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_niubrurEemqlCeh + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_niubrurEemqlCeh +.L_small_initial_partial_block_niubrurEemqlCeh: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq 
%zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_niubrurEemqlCeh: + + orq %r8,%r8 + je .L_after_reduction_niubrurEemqlCeh + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_niubrurEemqlCeh: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_14_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_mybAsEhdaxgnGrE + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_mybAsEhdaxgnGrE + +.L_16_blocks_overflow_mybAsEhdaxgnGrE: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_mybAsEhdaxgnGrE: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + 
vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qtDEunzdagagyyt + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qtDEunzdagagyyt +.L_small_initial_partial_block_qtDEunzdagagyyt: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qtDEunzdagagyyt: + + orq %r8,%r8 + je .L_after_reduction_qtDEunzdagagyyt + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_qtDEunzdagagyyt: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_15_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_Bofftlllstcnhmp + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_Bofftlllstcnhmp + 
+.L_16_blocks_overflow_Bofftlllstcnhmp: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_Bofftlllstcnhmp: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 
$1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ahcvvxeChlezaBm + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_ahcvvxeChlezaBm +.L_small_initial_partial_block_ahcvvxeChlezaBm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ahcvvxeChlezaBm: + + orq %r8,%r8 + je .L_after_reduction_ahcvvxeChlezaBm + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_ahcvvxeChlezaBm: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_16_xAfbdFbjfoyBlDz: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_nowrnsGGyachzjc + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_nowrnsGGyachzjc + +.L_16_blocks_overflow_nowrnsGGyachzjc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_nowrnsGGyachzjc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 
48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast 
%zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_AoBCchcjotapvgu: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_AoBCchcjotapvgu: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_AoBCchcjotapvgu: + jmp .L_last_blocks_done_xAfbdFbjfoyBlDz +.L_last_num_blocks_is_0_xAfbdFbjfoyBlDz: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + 
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_xAfbdFbjfoyBlDz: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_kgypzeldFqsBnqw + +.L_message_below_32_blocks_kgypzeldFqsBnqw: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_qckdlimbBeqylyq + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 
+ + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_qckdlimbBeqylyq: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_qdswuDcxyhGmasp + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_qdswuDcxyhGmasp + jb .L_last_num_blocks_is_7_1_qdswuDcxyhGmasp + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_qdswuDcxyhGmasp + jb .L_last_num_blocks_is_11_9_qdswuDcxyhGmasp + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_qdswuDcxyhGmasp + ja .L_last_num_blocks_is_16_qdswuDcxyhGmasp + cmpl $14,%r10d + je .L_last_num_blocks_is_14_qdswuDcxyhGmasp + jmp .L_last_num_blocks_is_13_qdswuDcxyhGmasp + +.L_last_num_blocks_is_11_9_qdswuDcxyhGmasp: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_qdswuDcxyhGmasp + ja .L_last_num_blocks_is_11_qdswuDcxyhGmasp + jmp .L_last_num_blocks_is_9_qdswuDcxyhGmasp + +.L_last_num_blocks_is_7_1_qdswuDcxyhGmasp: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_qdswuDcxyhGmasp + jb .L_last_num_blocks_is_3_1_qdswuDcxyhGmasp + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_qdswuDcxyhGmasp + je .L_last_num_blocks_is_6_qdswuDcxyhGmasp + jmp .L_last_num_blocks_is_5_qdswuDcxyhGmasp + +.L_last_num_blocks_is_3_1_qdswuDcxyhGmasp: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_qdswuDcxyhGmasp + je .L_last_num_blocks_is_2_qdswuDcxyhGmasp +.L_last_num_blocks_is_1_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_AqvkjwfuBmvGzFo + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_AqvkjwfuBmvGzFo + +.L_16_blocks_overflow_AqvkjwfuBmvGzFo: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_AqvkjwfuBmvGzFo: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zDugdiozxlCaAFc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zDugdiozxlCaAFc +.L_small_initial_partial_block_zDugdiozxlCaAFc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_zDugdiozxlCaAFc +.L_small_initial_compute_done_zDugdiozxlCaAFc: +.L_after_reduction_zDugdiozxlCaAFc: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_2_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_BFBqcyfExFAkGzj + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_BFBqcyfExFAkGzj + +.L_16_blocks_overflow_BFBqcyfExFAkGzj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_BFBqcyfExFAkGzj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_bgisyxAEeEpkobG + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_bgisyxAEeEpkobG +.L_small_initial_partial_block_bgisyxAEeEpkobG: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 
%xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_bgisyxAEeEpkobG: + + orq %r8,%r8 + je .L_after_reduction_bgisyxAEeEpkobG + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_bgisyxAEeEpkobG: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_3_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_yizvcDtiefGCDev + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_yizvcDtiefGCDev + +.L_16_blocks_overflow_yizvcDtiefGCDev: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_yizvcDtiefGCDev: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fegyzcDscsgdCgo + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fegyzcDscsgdCgo +.L_small_initial_partial_block_fegyzcDscsgdCgo: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fegyzcDscsgdCgo: + + orq %r8,%r8 + je .L_after_reduction_fegyzcDscsgdCgo + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_fegyzcDscsgdCgo: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_4_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_DGjzymFiusiuxvc + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_DGjzymFiusiuxvc + +.L_16_blocks_overflow_DGjzymFiusiuxvc: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_DGjzymFiusiuxvc: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DyGAAdrBpclAjrf + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DyGAAdrBpclAjrf +.L_small_initial_partial_block_DyGAAdrBpclAjrf: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 
304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DyGAAdrBpclAjrf: + + orq %r8,%r8 + je .L_after_reduction_DyGAAdrBpclAjrf + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_DyGAAdrBpclAjrf: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_5_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_qmnbjAabAnlrekx + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_qmnbjAabAnlrekx + +.L_16_blocks_overflow_qmnbjAabAnlrekx: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_qmnbjAabAnlrekx: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_qdgqavzegrGAAjz + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_qdgqavzegrGAAjz +.L_small_initial_partial_block_qdgqavzegrGAAjz: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_qdgqavzegrGAAjz: + + orq %r8,%r8 + je .L_after_reduction_qdgqavzegrGAAjz + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_qdgqavzegrGAAjz: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_6_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_AkAddilhnCabyyf + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_AkAddilhnCabyyf + +.L_16_blocks_overflow_AkAddilhnCabyyf: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_AkAddilhnCabyyf: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + 
vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_iibprCbqDlikAnd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_iibprCbqDlikAnd +.L_small_initial_partial_block_iibprCbqDlikAnd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_iibprCbqDlikAnd: + + orq %r8,%r8 + je .L_after_reduction_iibprCbqDlikAnd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_iibprCbqDlikAnd: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_7_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_lxvhGbsbefzGdxF + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_lxvhGbsbefzGdxF + 
+.L_16_blocks_overflow_lxvhGbsbefzGdxF: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_lxvhGbsbefzGdxF: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_GthoECEdfcnGsvc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_GthoECEdfcnGsvc +.L_small_initial_partial_block_GthoECEdfcnGsvc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_GthoECEdfcnGsvc: + + orq %r8,%r8 + je .L_after_reduction_GthoECEdfcnGsvc + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_GthoECEdfcnGsvc: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_8_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_qwiyktwmAFnlrAv + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_qwiyktwmAFnlrAv + +.L_16_blocks_overflow_qwiyktwmAFnlrAv: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_qwiyktwmAFnlrAv: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + 
vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_hBGcauuiubbhsmg + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + 
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_hBGcauuiubbhsmg +.L_small_initial_partial_block_hBGcauuiubbhsmg: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_hBGcauuiubbhsmg: + + orq %r8,%r8 + je .L_after_reduction_hBGcauuiubbhsmg + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_hBGcauuiubbhsmg: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_9_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_Aahazrycncacmjd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_Aahazrycncacmjd + +.L_16_blocks_overflow_Aahazrycncacmjd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_Aahazrycncacmjd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + 
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xijDGphAfrrjvcn + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + 
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xijDGphAfrrjvcn +.L_small_initial_partial_block_xijDGphAfrrjvcn: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xijDGphAfrrjvcn: + + orq %r8,%r8 + je .L_after_reduction_xijDGphAfrrjvcn + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_xijDGphAfrrjvcn: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_10_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_hkbadvpbxvroayG + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_hkbadvpbxvroayG + +.L_16_blocks_overflow_hkbadvpbxvroayG: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_hkbadvpbxvroayG: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + 
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_oahmBbxzjdosefa + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + 
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_oahmBbxzjdosefa +.L_small_initial_partial_block_oahmBbxzjdosefa: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_oahmBbxzjdosefa: + + orq %r8,%r8 + je .L_after_reduction_oahmBbxzjdosefa + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_oahmBbxzjdosefa: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_11_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_FsdwrjvehsptDBd + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_FsdwrjvehsptDBd + +.L_16_blocks_overflow_FsdwrjvehsptDBd: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + 
vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_FsdwrjvehsptDBd: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + 
vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yodgBeqbEhheCDd + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yodgBeqbEhheCDd +.L_small_initial_partial_block_yodgBeqbEhheCDd: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_yodgBeqbEhheCDd: + + orq %r8,%r8 + je .L_after_reduction_yodgBeqbEhheCDd + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_yodgBeqbEhheCDd: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_12_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_thkeiGylBuuojur + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_thkeiGylBuuojur + +.L_16_blocks_overflow_thkeiGylBuuojur: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_thkeiGylBuuojur: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yzbzfadAzvvaytc + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yzbzfadAzvvaytc +.L_small_initial_partial_block_yzbzfadAzvvaytc: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 
+ vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yzbzfadAzvvaytc: + + orq %r8,%r8 + je .L_after_reduction_yzbzfadAzvvaytc + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_yzbzfadAzvvaytc: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_13_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_eFxvoygBEBGohmA + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_eFxvoygBEBGohmA + +.L_16_blocks_overflow_eFxvoygBEBGohmA: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_eFxvoygBEBGohmA: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + 
vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_zzewAuyevyjoCwC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq 
%ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_zzewAuyevyjoCwC +.L_small_initial_partial_block_zzewAuyevyjoCwC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_zzewAuyevyjoCwC: + + orq %r8,%r8 + je .L_after_reduction_zzewAuyevyjoCwC + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_zzewAuyevyjoCwC: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_14_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_wcubmfDtExvnDlb + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_wcubmfDtExvnDlb + +.L_16_blocks_overflow_wcubmfDtExvnDlb: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_wcubmfDtExvnDlb: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb 
%zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_wbcvGrEDxndwxqw + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_wbcvGrEDxndwxqw +.L_small_initial_partial_block_wbcvGrEDxndwxqw: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + 
vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wbcvGrEDxndwxqw: + + orq %r8,%r8 + je .L_after_reduction_wbcvGrEDxndwxqw + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wbcvGrEDxndwxqw: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_15_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_hDvByfpahyymzEv + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_hDvByfpahyymzEv + +.L_16_blocks_overflow_hDvByfpahyymzEv: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_hDvByfpahyymzEv: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_uAckhsjfbEBxdkE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_uAckhsjfbEBxdkE +.L_small_initial_partial_block_uAckhsjfbEBxdkE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_uAckhsjfbEBxdkE: + + orq %r8,%r8 + je .L_after_reduction_uAckhsjfbEBxdkE + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_uAckhsjfbEBxdkE: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_16_qdswuDcxyhGmasp: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_rnhelBbtegFkzjj + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_rnhelBbtegFkzjj + +.L_16_blocks_overflow_rnhelBbtegFkzjj: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_rnhelBbtegFkzjj: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 
+ vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_wEgqnyhjgyEjfkm: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_wEgqnyhjgyEjfkm: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_wEgqnyhjgyEjfkm: + jmp .L_last_blocks_done_qdswuDcxyhGmasp +.L_last_num_blocks_is_0_qdswuDcxyhGmasp: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq 
%xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_qdswuDcxyhGmasp: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_kgypzeldFqsBnqw + +.L_message_below_equal_16_blocks_kgypzeldFqsBnqw: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_uBFzjxzanCsxGGe + jl .L_small_initial_num_blocks_is_7_1_uBFzjxzanCsxGGe + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_uBFzjxzanCsxGGe + jl .L_small_initial_num_blocks_is_11_9_uBFzjxzanCsxGGe + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_uBFzjxzanCsxGGe + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_uBFzjxzanCsxGGe + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_uBFzjxzanCsxGGe + jmp .L_small_initial_num_blocks_is_13_uBFzjxzanCsxGGe + +.L_small_initial_num_blocks_is_11_9_uBFzjxzanCsxGGe: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_uBFzjxzanCsxGGe + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_uBFzjxzanCsxGGe + jmp .L_small_initial_num_blocks_is_9_uBFzjxzanCsxGGe + +.L_small_initial_num_blocks_is_7_1_uBFzjxzanCsxGGe: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_uBFzjxzanCsxGGe + jl .L_small_initial_num_blocks_is_3_1_uBFzjxzanCsxGGe + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_uBFzjxzanCsxGGe + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_uBFzjxzanCsxGGe + jmp .L_small_initial_num_blocks_is_5_uBFzjxzanCsxGGe + +.L_small_initial_num_blocks_is_3_1_uBFzjxzanCsxGGe: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_uBFzjxzanCsxGGe + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_uBFzjxzanCsxGGe + + + + + +.L_small_initial_num_blocks_is_1_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm6,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_usvkeoywsioAnfD + + + + 
+ + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_usvkeoywsioAnfD +.L_small_initial_partial_block_usvkeoywsioAnfD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_usvkeoywsioAnfD +.L_small_initial_compute_done_usvkeoywsioAnfD: +.L_after_reduction_usvkeoywsioAnfD: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_2_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm6,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_yvjeqFrhsrkxcss + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + 
vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_yvjeqFrhsrkxcss +.L_small_initial_partial_block_yvjeqFrhsrkxcss: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_yvjeqFrhsrkxcss: + + orq %r8,%r8 + je .L_after_reduction_yvjeqFrhsrkxcss + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_yvjeqFrhsrkxcss: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_3_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mvdynCrzwGwegAr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq 
%zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mvdynCrzwGwegAr +.L_small_initial_partial_block_mvdynCrzwGwegAr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mvdynCrzwGwegAr: + + orq %r8,%r8 + je .L_after_reduction_mvdynCrzwGwegAr + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_mvdynCrzwGwegAr: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_4_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_pjDzAfyivuABgdr + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 
288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_pjDzAfyivuABgdr +.L_small_initial_partial_block_pjDzAfyivuABgdr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_pjDzAfyivuABgdr: + + orq %r8,%r8 + je .L_after_reduction_pjDzAfyivuABgdr + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_pjDzAfyivuABgdr: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_5_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + 
vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %xmm29,%xmm7,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_fcBludqftzBwbAa + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fcBludqftzBwbAa +.L_small_initial_partial_block_fcBludqftzBwbAa: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fcBludqftzBwbAa: + + orq %r8,%r8 + je .L_after_reduction_fcBludqftzBwbAa + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_fcBludqftzBwbAa: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_6_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd 
ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %ymm29,%ymm7,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_gpklsvBmbaGumBx + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_gpklsvBmbaGumBx +.L_small_initial_partial_block_gpklsvBmbaGumBx: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq 
$0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_gpklsvBmbaGumBx: + + orq %r8,%r8 + je .L_after_reduction_gpklsvBmbaGumBx + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_gpklsvBmbaGumBx: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_7_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_fFxDDorEtzfbsCi + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_fFxDDorEtzfbsCi +.L_small_initial_partial_block_fFxDDorEtzfbsCi: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_fFxDDorEtzfbsCi: + + orq %r8,%r8 + je .L_after_reduction_fFxDDorEtzfbsCi + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_fFxDDorEtzfbsCi: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_8_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 
32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_mhgromrjcFpqAxA + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_mhgromrjcFpqAxA +.L_small_initial_partial_block_mhgromrjcFpqAxA: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 
+ vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_mhgromrjcFpqAxA: + + orq %r8,%r8 + je .L_after_reduction_mhgromrjcFpqAxA + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_mhgromrjcFpqAxA: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_9_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb 
%zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %xmm29,%xmm10,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_eghzedifwilpnEF + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_eghzedifwilpnEF +.L_small_initial_partial_block_eghzedifwilpnEF: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_eghzedifwilpnEF: + + orq %r8,%r8 + je .L_after_reduction_eghzedifwilpnEF + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_eghzedifwilpnEF: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_10_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + 
leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %ymm29,%ymm10,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_aBEqcFFmwBplgFE + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + 
vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_aBEqcFFmwBplgFE +.L_small_initial_partial_block_aBEqcFFmwBplgFE: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_aBEqcFFmwBplgFE: + + orq %r8,%r8 + je .L_after_reduction_aBEqcFFmwBplgFE + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_aBEqcFFmwBplgFE: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_11_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + 
vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_ozteDdAwrbobDia + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_ozteDdAwrbobDia +.L_small_initial_partial_block_ozteDdAwrbobDia: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 
%xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_ozteDdAwrbobDia: + + orq %r8,%r8 + je .L_after_reduction_ozteDdAwrbobDia + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_ozteDdAwrbobDia: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_12_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_xaldGCCAFmcudnD + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_xaldGCCAFmcudnD +.L_small_initial_partial_block_xaldGCCAFmcudnD: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 
+ vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_xaldGCCAFmcudnD: + + orq %r8,%r8 + je .L_after_reduction_xaldGCCAFmcudnD + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_xaldGCCAFmcudnD: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_13_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %xmm29,%xmm11,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_txhExvepwglFbiC + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_txhExvepwglFbiC +.L_small_initial_partial_block_txhExvepwglFbiC: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + 
vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_txhExvepwglFbiC: + + orq %r8,%r8 + je .L_after_reduction_txhExvepwglFbiC + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_txhExvepwglFbiC: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_14_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %ymm29,%ymm11,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_usDayEFvfwmlydb + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_usDayEFvfwmlydb +.L_small_initial_partial_block_usDayEFvfwmlydb: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq 
$0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_usDayEFvfwmlydb: + + orq %r8,%r8 + je .L_after_reduction_usDayEFvfwmlydb + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_usDayEFvfwmlydb: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_15_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc 
%zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_DrCACnmarBwymye + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_DrCACnmarBwymye +.L_small_initial_partial_block_DrCACnmarBwymye: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + 
vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_DrCACnmarBwymye: + + orq %r8,%r8 + je .L_after_reduction_DrCACnmarBwymye + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_DrCACnmarBwymye: + jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe +.L_small_initial_num_blocks_is_16_uBFzjxzanCsxGGe: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc 
%zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_khwfpcqckgAmFnr: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_khwfpcqckgAmFnr: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_khwfpcqckgAmFnr: +.L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe: +.L_ghash_done_kgypzeldFqsBnqw: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_kgypzeldFqsBnqw: + jmp .Lexit_gcm_decrypt +.Lexit_gcm_decrypt: + cmpq $256,%r8 + jbe 
.Lskip_hkeys_cleanup_cdrboBdzwmggbeq + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 %zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_cdrboBdzwmggbeq: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.Ldecrypt_seh_end: +.cfi_endproc +.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512 +.globl ossl_aes_gcm_finalize_avx512 +.type ossl_aes_gcm_finalize_avx512,@function +.align 32 +ossl_aes_gcm_finalize_avx512: +.cfi_startproc +.byte 243,15,30,250 + vmovdqu 336(%rdi),%xmm2 + vmovdqu 32(%rdi),%xmm3 + vmovdqu 64(%rdi),%xmm4 + + + cmpq $0,%rsi + je .L_partial_done_sAyBcyeiDCmpxul + + vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0 + vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16 + vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17 + vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4 + vpxorq %xmm17,%xmm4,%xmm4 + + vpsrldq $8,%xmm4,%xmm17 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm17,%xmm0,%xmm0 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vmovdqu64 POLY2(%rip),%xmm17 + + vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16 + vpslldq $8,%xmm16,%xmm16 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16 + vpsrldq $4,%xmm16,%xmm16 + vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4 + vpslldq $4,%xmm4,%xmm4 + + vpternlogq $0x96,%xmm16,%xmm0,%xmm4 + +.L_partial_done_sAyBcyeiDCmpxul: + vmovq 56(%rdi),%xmm5 + vpinsrq $1,48(%rdi),%xmm5,%xmm5 + vpsllq $3,%xmm5,%xmm5 + + vpxor %xmm5,%xmm4,%xmm4 + + vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0 + vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16 + vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17 + vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4 + vpxorq %xmm17,%xmm4,%xmm4 + + vpsrldq $8,%xmm4,%xmm17 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm17,%xmm0,%xmm0 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vmovdqu64 POLY2(%rip),%xmm17 + + vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16 + vpslldq $8,%xmm16,%xmm16 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16 + vpsrldq $4,%xmm16,%xmm16 + vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4 + vpslldq $4,%xmm4,%xmm4 + + vpternlogq $0x96,%xmm16,%xmm0,%xmm4 + + vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + +.L_return_T_sAyBcyeiDCmpxul: + vmovdqu %xmm3,64(%rdi) +.Labort_finalize: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512 +.globl ossl_gcm_gmult_avx512 +.hidden ossl_gcm_gmult_avx512 +.type ossl_gcm_gmult_avx512,@function +.align 32 +ossl_gcm_gmult_avx512: +.cfi_startproc +.byte 243,15,30,250 + vmovdqu64 (%rdi),%xmm1 + vmovdqu64 336(%rsi),%xmm2 + + vpclmulqdq $0x11,%xmm2,%xmm1,%xmm3 + vpclmulqdq $0x00,%xmm2,%xmm1,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm1,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm1,%xmm1 + vpxorq %xmm5,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm5 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm5,%xmm3,%xmm3 + vpxorq %xmm4,%xmm1,%xmm1 + + + + vmovdqu64 POLY2(%rip),%xmm5 + + vpclmulqdq $0x01,%xmm1,%xmm5,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm1,%xmm1 + + + + vpclmulqdq $0x00,%xmm1,%xmm5,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq 
$0x10,%xmm1,%xmm5,%xmm1 + vpslldq $4,%xmm1,%xmm1 + + vpternlogq $0x96,%xmm4,%xmm3,%xmm1 + + vmovdqu64 %xmm1,(%rdi) + vzeroupper +.Labort_gmult: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512 +.data +.align 16 +POLY:.quad 0x0000000000000001, 0xC200000000000000 + +.align 64 +POLY2: +.quad 0x00000001C2000000, 0xC200000000000000 +.quad 0x00000001C2000000, 0xC200000000000000 +.quad 0x00000001C2000000, 0xC200000000000000 +.quad 0x00000001C2000000, 0xC200000000000000 + +.align 16 +TWOONE:.quad 0x0000000000000001, 0x0000000100000000 + + + +.align 64 +SHUF_MASK: +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + +.align 16 +SHIFT_MASK: +.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908 + +ALL_F: +.quad 0xffffffffffffffff, 0xffffffffffffffff + +ZERO: +.quad 0x0000000000000000, 0x0000000000000000 + +.align 16 +ONE: +.quad 0x0000000000000001, 0x0000000000000000 + +.align 16 +ONEf: +.quad 0x0000000000000000, 0x0100000000000000 + +.align 64 +ddq_add_1234: +.quad 0x0000000000000001, 0x0000000000000000 +.quad 0x0000000000000002, 0x0000000000000000 +.quad 0x0000000000000003, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 + +.align 64 +ddq_add_5678: +.quad 0x0000000000000005, 0x0000000000000000 +.quad 0x0000000000000006, 0x0000000000000000 +.quad 0x0000000000000007, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 + +.align 64 +ddq_add_4444: +.quad 0x0000000000000004, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 + +.align 64 +ddq_add_8888: +.quad 0x0000000000000008, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 + +.align 64 +ddq_addbe_1234: +.quad 0x0000000000000000, 0x0100000000000000 +.quad 0x0000000000000000, 0x0200000000000000 +.quad 0x0000000000000000, 0x0300000000000000 +.quad 0x0000000000000000, 0x0400000000000000 + +.align 64 +ddq_addbe_4444: +.quad 0x0000000000000000, 0x0400000000000000 +.quad 0x0000000000000000, 0x0400000000000000 +.quad 0x0000000000000000, 0x0400000000000000 +.quad 0x0000000000000000, 0x0400000000000000 + +.align 64 +byte_len_to_mask_table: +.value 0x0000, 0x0001, 0x0003, 0x0007 +.value 0x000f, 0x001f, 0x003f, 0x007f +.value 0x00ff, 0x01ff, 0x03ff, 0x07ff +.value 0x0fff, 0x1fff, 0x3fff, 0x7fff +.value 0xffff + +.align 64 +byte64_len_to_mask_table: +.quad 0x0000000000000000, 0x0000000000000001 +.quad 0x0000000000000003, 0x0000000000000007 +.quad 0x000000000000000f, 0x000000000000001f +.quad 0x000000000000003f, 0x000000000000007f +.quad 0x00000000000000ff, 0x00000000000001ff +.quad 0x00000000000003ff, 0x00000000000007ff +.quad 0x0000000000000fff, 0x0000000000001fff +.quad 0x0000000000003fff, 0x0000000000007fff +.quad 0x000000000000ffff, 0x000000000001ffff +.quad 0x000000000003ffff, 0x000000000007ffff +.quad 0x00000000000fffff, 0x00000000001fffff +.quad 0x00000000003fffff, 0x00000000007fffff +.quad 0x0000000000ffffff, 0x0000000001ffffff +.quad 0x0000000003ffffff, 0x0000000007ffffff +.quad 0x000000000fffffff, 0x000000001fffffff +.quad 0x000000003fffffff, 0x000000007fffffff +.quad 0x00000000ffffffff, 0x00000001ffffffff +.quad 0x00000003ffffffff, 0x00000007ffffffff +.quad 0x0000000fffffffff, 0x0000001fffffffff +.quad 0x0000003fffffffff, 0x0000007fffffffff +.quad 
0x000000ffffffffff, 0x000001ffffffffff +.quad 0x000003ffffffffff, 0x000007ffffffffff +.quad 0x00000fffffffffff, 0x00001fffffffffff +.quad 0x00003fffffffffff, 0x00007fffffffffff +.quad 0x0000ffffffffffff, 0x0001ffffffffffff +.quad 0x0003ffffffffffff, 0x0007ffffffffffff +.quad 0x000fffffffffffff, 0x001fffffffffffff +.quad 0x003fffffffffffff, 0x007fffffffffffff +.quad 0x00ffffffffffffff, 0x01ffffffffffffff +.quad 0x03ffffffffffffff, 0x07ffffffffffffff +.quad 0x0fffffffffffffff, 0x1fffffffffffffff +.quad 0x3fffffffffffffff, 0x7fffffffffffffff +.quad 0xffffffffffffffff + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/ossl_aes_gcm.c b/sys/crypto/openssl/amd64/ossl_aes_gcm.c new file mode 100644 index 000000000000..3381d35557f2 --- /dev/null +++ b/sys/crypto/openssl/amd64/ossl_aes_gcm.c @@ -0,0 +1,233 @@ +/* + * Copyright 2010-2022 The OpenSSL Project Authors. All Rights Reserved. + * Copyright (c) 2021, Intel Corporation. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* + * This file contains a AES-GCM wrapper implementation from OpenSSL 3.1, + * targeting amd64 VAES extensions. This was ported from + * cipher_aes_gcm_hw_vaes_avx512.inc. + */ + +#include +#include + +#include +#include +#include + +#include + +_Static_assert( + sizeof(struct ossl_gcm_context) <= sizeof(struct ossl_cipher_context), + "ossl_gcm_context too large"); + +void aesni_set_encrypt_key(const void *key, int bits, void *ctx); + +static void +gcm_init(struct ossl_gcm_context *ctx, const void *key, size_t keylen) +{ + KASSERT(keylen == 128 || keylen == 192 || keylen == 256, + ("%s: invalid key length %zu", __func__, keylen)); + + memset(&ctx->gcm, 0, sizeof(ctx->gcm)); + memset(&ctx->aes_ks, 0, sizeof(ctx->aes_ks)); + aesni_set_encrypt_key(key, keylen, &ctx->aes_ks); + ctx->ops->init(ctx, key, keylen); +} + +static void +gcm_tag(struct ossl_gcm_context *ctx, unsigned char *tag, size_t len) +{ + (void)ctx->ops->finish(ctx, NULL, 0); + memcpy(tag, ctx->gcm.Xi.c, len); +} + +void ossl_gcm_gmult_avx512(uint64_t Xi[2], void *gcm128ctx); +void ossl_aes_gcm_init_avx512(const void *ks, void *gcm128ctx); +void ossl_aes_gcm_setiv_avx512(const void *ks, void *gcm128ctx, + const unsigned char *iv, size_t ivlen); +void ossl_aes_gcm_update_aad_avx512(void *gcm128ctx, const unsigned char *aad, + size_t len); +void ossl_aes_gcm_encrypt_avx512(const void *ks, void *gcm128ctx, + unsigned int *pblocklen, const unsigned char *in, size_t len, + unsigned char *out); +void ossl_aes_gcm_decrypt_avx512(const void *ks, void *gcm128ctx, + unsigned int *pblocklen, const unsigned char *in, size_t len, + unsigned char *out); +void ossl_aes_gcm_finalize_avx512(void *gcm128ctx, unsigned int pblocklen); + +static void +gcm_init_avx512(struct ossl_gcm_context *ctx, const void *key, size_t keylen) +{ + ossl_aes_gcm_init_avx512(&ctx->aes_ks, &ctx->gcm); +} + +static void +gcm_setiv_avx512(struct ossl_gcm_context *ctx, const unsigned char *iv, + size_t len) +{ + KASSERT(len == AES_GCM_IV_LEN, + ("%s: invalid IV length %zu", __func__, 
len)); + + ctx->gcm.Yi.u[0] = 0; /* Current counter */ + ctx->gcm.Yi.u[1] = 0; + ctx->gcm.Xi.u[0] = 0; /* AAD hash */ + ctx->gcm.Xi.u[1] = 0; + ctx->gcm.len.u[0] = 0; /* AAD length */ + ctx->gcm.len.u[1] = 0; /* Message length */ + ctx->gcm.ares = 0; + ctx->gcm.mres = 0; + + ossl_aes_gcm_setiv_avx512(&ctx->aes_ks, ctx, iv, len); +} + +static int +gcm_aad_avx512(struct ossl_gcm_context *ctx, const unsigned char *aad, + size_t len) +{ + uint64_t alen = ctx->gcm.len.u[0]; + size_t lenblks; + unsigned int ares; + + /* Bad sequence: call of AAD update after message processing */ + if (ctx->gcm.len.u[1]) + return -2; + + alen += len; + /* AAD is limited by 2^64 bits, thus 2^61 bytes */ + if (alen > (1ull << 61) || (sizeof(len) == 8 && alen < len)) + return -1; + ctx->gcm.len.u[0] = alen; + + ares = ctx->gcm.ares; + /* Partial AAD block left from previous AAD update calls */ + if (ares > 0) { + /* + * Fill partial block buffer till full block + * (note, the hash is stored reflected) + */ + while (ares > 0 && len > 0) { + ctx->gcm.Xi.c[15 - ares] ^= *(aad++); + --len; + ares = (ares + 1) % AES_BLOCK_LEN; + } + /* Full block gathered */ + if (ares == 0) { + ossl_gcm_gmult_avx512(ctx->gcm.Xi.u, ctx); + } else { /* no more AAD */ + ctx->gcm.ares = ares; + return 0; + } + } + + /* Bulk AAD processing */ + lenblks = len & ((size_t)(-AES_BLOCK_LEN)); + if (lenblks > 0) { + ossl_aes_gcm_update_aad_avx512(ctx, aad, lenblks); + aad += lenblks; + len -= lenblks; + } + + /* Add remaining AAD to the hash (note, the hash is stored reflected) */ + if (len > 0) { + ares = (unsigned int)len; + for (size_t i = 0; i < len; ++i) + ctx->gcm.Xi.c[15 - i] ^= aad[i]; + } + + ctx->gcm.ares = ares; + + return 0; +} + +static int +_gcm_encrypt_avx512(struct ossl_gcm_context *ctx, const unsigned char *in, + unsigned char *out, size_t len, bool encrypt) +{ + uint64_t mlen = ctx->gcm.len.u[1]; + + mlen += len; + if (mlen > ((1ull << 36) - 32) || (sizeof(len) == 8 && mlen < len)) + return -1; + + ctx->gcm.len.u[1] = mlen; + + /* Finalize GHASH(AAD) if AAD partial blocks left unprocessed */ + if (ctx->gcm.ares > 0) { + ossl_gcm_gmult_avx512(ctx->gcm.Xi.u, ctx); + ctx->gcm.ares = 0; + } + + if (encrypt) { + ossl_aes_gcm_encrypt_avx512(&ctx->aes_ks, ctx, &ctx->gcm.mres, + in, len, out); + } else { + ossl_aes_gcm_decrypt_avx512(&ctx->aes_ks, ctx, &ctx->gcm.mres, + in, len, out); + } + + return 0; +} + +static int +gcm_encrypt_avx512(struct ossl_gcm_context *ctx, const unsigned char *in, + unsigned char *out, size_t len) +{ + return _gcm_encrypt_avx512(ctx, in, out, len, true); +} + +static int +gcm_decrypt_avx512(struct ossl_gcm_context *ctx, const unsigned char *in, + unsigned char *out, size_t len) +{ + return _gcm_encrypt_avx512(ctx, in, out, len, false); +} + +static int +gcm_finish_avx512(struct ossl_gcm_context *ctx, const unsigned char *tag, + size_t len) +{ + unsigned int *res = &ctx->gcm.mres; + + /* Finalize AAD processing */ + if (ctx->gcm.ares > 0) + res = &ctx->gcm.ares; + + ossl_aes_gcm_finalize_avx512(ctx, *res); + + ctx->gcm.ares = ctx->gcm.mres = 0; + + if (tag != NULL) + return timingsafe_bcmp(ctx->gcm.Xi.c, tag, len); + return 0; +} + +static const struct ossl_aes_gcm_ops gcm_ops_avx512 = { + .init = gcm_init_avx512, + .setiv = gcm_setiv_avx512, + .aad = gcm_aad_avx512, + .encrypt = gcm_encrypt_avx512, + .decrypt = gcm_decrypt_avx512, + .finish = gcm_finish_avx512, + .tag = gcm_tag, +}; + +int ossl_aes_gcm_setkey_avx512(const unsigned char *key, int klen, void *_ctx); + +int +ossl_aes_gcm_setkey_avx512(const 
unsigned char *key, int klen, + void *_ctx) +{ + struct ossl_gcm_context *ctx; + + ctx = _ctx; + ctx->ops = &gcm_ops_avx512; + gcm_init(ctx, key, klen); + return (0); +} diff --git a/sys/crypto/openssl/ossl.c b/sys/crypto/openssl/ossl.c index 9c3465b264b7..723d1a80543d 100644 --- a/sys/crypto/openssl/ossl.c +++ b/sys/crypto/openssl/ossl.c @@ -1,442 +1,486 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2020 Netflix, Inc * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ /* * A driver for the OpenCrypto framework which uses assembly routines * from OpenSSL. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "cryptodev_if.h" static MALLOC_DEFINE(M_OSSL, "ossl", "OpenSSL crypto"); static void ossl_identify(driver_t *driver, device_t parent) { if (device_find_child(parent, "ossl", -1) == NULL) BUS_ADD_CHILD(parent, 10, "ossl", -1); } static int ossl_probe(device_t dev) { device_set_desc(dev, "OpenSSL crypto"); return (BUS_PROBE_DEFAULT); } static int ossl_attach(device_t dev) { struct ossl_softc *sc; sc = device_get_softc(dev); + sc->has_aes = sc->has_aes_gcm = false; + ossl_cpuid(sc); sc->sc_cid = crypto_get_driverid(dev, sizeof(struct ossl_session), CRYPTOCAP_F_SOFTWARE | CRYPTOCAP_F_SYNC | CRYPTOCAP_F_ACCEL_SOFTWARE); if (sc->sc_cid < 0) { device_printf(dev, "failed to allocate crypto driver id\n"); return (ENXIO); } return (0); } static int ossl_detach(device_t dev) { struct ossl_softc *sc; sc = device_get_softc(dev); crypto_unregister_all(sc->sc_cid); return (0); } static struct auth_hash * ossl_lookup_hash(const struct crypto_session_params *csp) { switch (csp->csp_auth_alg) { case CRYPTO_SHA1: case CRYPTO_SHA1_HMAC: return (&ossl_hash_sha1); case CRYPTO_SHA2_224: case CRYPTO_SHA2_224_HMAC: return (&ossl_hash_sha224); case CRYPTO_SHA2_256: case CRYPTO_SHA2_256_HMAC: return (&ossl_hash_sha256); case CRYPTO_SHA2_384: case CRYPTO_SHA2_384_HMAC: return (&ossl_hash_sha384); case CRYPTO_SHA2_512: case CRYPTO_SHA2_512_HMAC: return (&ossl_hash_sha512); case CRYPTO_POLY1305: return (&ossl_hash_poly1305); default: return (NULL); } } static struct ossl_cipher* ossl_lookup_cipher(const struct crypto_session_params *csp) { switch (csp->csp_cipher_alg) { case CRYPTO_AES_CBC: switch (csp->csp_cipher_klen * 8) { case 128: case 192: case 256: break; default: return (NULL); } return (&ossl_cipher_aes_cbc); + case CRYPTO_AES_NIST_GCM_16: + switch (csp->csp_cipher_klen * 8) { + case 128: + case 192: + case 256: + break; + default: + return (NULL); + } + return (&ossl_cipher_aes_gcm); case CRYPTO_CHACHA20: if (csp->csp_cipher_klen != CHACHA_KEY_SIZE) return (NULL); return (&ossl_cipher_chacha20); default: return (NULL); } } static int ossl_probesession(device_t dev, const struct crypto_session_params *csp) { struct ossl_softc *sc = device_get_softc(dev); if ((csp->csp_flags & ~(CSP_F_SEPARATE_OUTPUT | CSP_F_SEPARATE_AAD)) != 0) return (EINVAL); switch (csp->csp_mode) { case CSP_MODE_DIGEST: if (ossl_lookup_hash(csp) == NULL) return (EINVAL); break; case CSP_MODE_CIPHER: if (csp->csp_cipher_alg != CRYPTO_CHACHA20 && !sc->has_aes) return (EINVAL); if (ossl_lookup_cipher(csp) == NULL) return (EINVAL); break; case CSP_MODE_ETA: if (!sc->has_aes || csp->csp_cipher_alg == CRYPTO_CHACHA20 || ossl_lookup_hash(csp) == NULL || ossl_lookup_cipher(csp) == NULL) return (EINVAL); break; case CSP_MODE_AEAD: switch (csp->csp_cipher_alg) { case CRYPTO_CHACHA20_POLY1305: break; + case CRYPTO_AES_NIST_GCM_16: + if (!sc->has_aes_gcm || ossl_lookup_cipher(csp) == NULL) + return (EINVAL); + if (csp->csp_ivlen != AES_GCM_IV_LEN) + return (EINVAL); + if (csp->csp_auth_mlen != 0 && + csp->csp_auth_mlen != GMAC_DIGEST_LEN) + return (EINVAL); + break; default: return (EINVAL); } break; default: return (EINVAL); } return (CRYPTODEV_PROBE_ACCEL_SOFTWARE); } static void ossl_newsession_hash(struct ossl_session *s, const struct crypto_session_params *csp) { struct auth_hash *axf; axf = ossl_lookup_hash(csp); s->hash.axf = axf; if (csp->csp_auth_mlen == 0) s->hash.mlen = axf->hashsize; else 
s->hash.mlen = csp->csp_auth_mlen; if (csp->csp_auth_klen == 0) { axf->Init(&s->hash.ictx); } else { if (csp->csp_auth_key != NULL) { fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); if (axf->Setkey != NULL) { axf->Init(&s->hash.ictx); axf->Setkey(&s->hash.ictx, csp->csp_auth_key, csp->csp_auth_klen); } else { hmac_init_ipad(axf, csp->csp_auth_key, csp->csp_auth_klen, &s->hash.ictx); hmac_init_opad(axf, csp->csp_auth_key, csp->csp_auth_klen, &s->hash.octx); } fpu_kern_leave(curthread, NULL); } } } static int ossl_newsession_cipher(struct ossl_session *s, const struct crypto_session_params *csp) { struct ossl_cipher *cipher; int error = 0; cipher = ossl_lookup_cipher(csp); if (cipher == NULL) return (EINVAL); s->cipher.cipher = cipher; if (csp->csp_cipher_key == NULL) return (0); fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); if (cipher->set_encrypt_key != NULL) { error = cipher->set_encrypt_key(csp->csp_cipher_key, 8 * csp->csp_cipher_klen, &s->cipher.enc_ctx); if (error != 0) { fpu_kern_leave(curthread, NULL); return (error); } } if (cipher->set_decrypt_key != NULL) error = cipher->set_decrypt_key(csp->csp_cipher_key, 8 * csp->csp_cipher_klen, &s->cipher.dec_ctx); fpu_kern_leave(curthread, NULL); return (error); } static int ossl_newsession(device_t dev, crypto_session_t cses, const struct crypto_session_params *csp) { struct ossl_session *s; int error = 0; s = crypto_get_driver_session(cses); switch (csp->csp_mode) { case CSP_MODE_DIGEST: ossl_newsession_hash(s, csp); break; case CSP_MODE_CIPHER: error = ossl_newsession_cipher(s, csp); break; case CSP_MODE_ETA: ossl_newsession_hash(s, csp); error = ossl_newsession_cipher(s, csp); break; + case CSP_MODE_AEAD: + error = ossl_newsession_cipher(s, csp); + break; + default: + __assert_unreachable(); } return (error); } static int ossl_process_hash(struct ossl_session *s, struct cryptop *crp, const struct crypto_session_params *csp) { struct ossl_hash_context ctx; char digest[HASH_MAX_LEN]; struct auth_hash *axf; int error; axf = s->hash.axf; if (crp->crp_auth_key == NULL) { ctx = s->hash.ictx; } else { if (axf->Setkey != NULL) { axf->Init(&ctx); axf->Setkey(&ctx, crp->crp_auth_key, csp->csp_auth_klen); } else { hmac_init_ipad(axf, crp->crp_auth_key, csp->csp_auth_klen, &ctx); } } if (crp->crp_aad != NULL) error = axf->Update(&ctx, crp->crp_aad, crp->crp_aad_length); else error = crypto_apply(crp, crp->crp_aad_start, crp->crp_aad_length, axf->Update, &ctx); if (error) goto out; error = crypto_apply(crp, crp->crp_payload_start, crp->crp_payload_length, axf->Update, &ctx); if (error) goto out; axf->Final(digest, &ctx); if (csp->csp_auth_klen != 0 && axf->Setkey == NULL) { if (crp->crp_auth_key == NULL) ctx = s->hash.octx; else hmac_init_opad(axf, crp->crp_auth_key, csp->csp_auth_klen, &ctx); axf->Update(&ctx, digest, axf->hashsize); axf->Final(digest, &ctx); } if (crp->crp_op & CRYPTO_OP_VERIFY_DIGEST) { char digest2[HASH_MAX_LEN]; crypto_copydata(crp, crp->crp_digest_start, s->hash.mlen, digest2); if (timingsafe_bcmp(digest, digest2, s->hash.mlen) != 0) error = EBADMSG; explicit_bzero(digest2, sizeof(digest2)); } else { crypto_copyback(crp, crp->crp_digest_start, s->hash.mlen, digest); } explicit_bzero(digest, sizeof(digest)); out: explicit_bzero(&ctx, sizeof(ctx)); return (error); } +static int +ossl_process_cipher(struct ossl_session *s, struct cryptop *crp, + const struct crypto_session_params *csp) +{ + return (s->cipher.cipher->process(&s->cipher, crp, csp)); +} + static int ossl_process_eta(struct ossl_session *s, struct cryptop *crp, 
const struct crypto_session_params *csp) { int error; if (CRYPTO_OP_IS_ENCRYPT(crp->crp_op)) { error = s->cipher.cipher->process(&s->cipher, crp, csp); if (error == 0) error = ossl_process_hash(s, crp, csp); } else { error = ossl_process_hash(s, crp, csp); if (error == 0) error = s->cipher.cipher->process(&s->cipher, crp, csp); } return (error); } +static int +ossl_process_aead(struct ossl_session *s, struct cryptop *crp, + const struct crypto_session_params *csp) +{ + if (csp->csp_cipher_alg == CRYPTO_CHACHA20_POLY1305) { + if (CRYPTO_OP_IS_ENCRYPT(crp->crp_op)) + return (ossl_chacha20_poly1305_encrypt(crp, csp)); + else + return (ossl_chacha20_poly1305_decrypt(crp, csp)); + } else { + return (s->cipher.cipher->process(&s->cipher, crp, csp)); + } +} + static int ossl_process(device_t dev, struct cryptop *crp, int hint) { const struct crypto_session_params *csp; struct ossl_session *s; int error; bool fpu_entered; s = crypto_get_driver_session(crp->crp_session); csp = crypto_get_params(crp->crp_session); if (is_fpu_kern_thread(0)) { fpu_entered = false; } else { fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); fpu_entered = true; } switch (csp->csp_mode) { case CSP_MODE_DIGEST: error = ossl_process_hash(s, crp, csp); break; case CSP_MODE_CIPHER: - error = s->cipher.cipher->process(&s->cipher, crp, csp); + error = ossl_process_cipher(s, crp, csp); break; case CSP_MODE_ETA: error = ossl_process_eta(s, crp, csp); break; case CSP_MODE_AEAD: - if (CRYPTO_OP_IS_ENCRYPT(crp->crp_op)) - error = ossl_chacha20_poly1305_encrypt(crp, csp); - else - error = ossl_chacha20_poly1305_decrypt(crp, csp); + error = ossl_process_aead(s, crp, csp); break; default: __assert_unreachable(); } if (fpu_entered) fpu_kern_leave(curthread, NULL); crp->crp_etype = error; crypto_done(crp); return (0); } static device_method_t ossl_methods[] = { DEVMETHOD(device_identify, ossl_identify), DEVMETHOD(device_probe, ossl_probe), DEVMETHOD(device_attach, ossl_attach), DEVMETHOD(device_detach, ossl_detach), DEVMETHOD(cryptodev_probesession, ossl_probesession), DEVMETHOD(cryptodev_newsession, ossl_newsession), DEVMETHOD(cryptodev_process, ossl_process), DEVMETHOD_END }; static driver_t ossl_driver = { "ossl", ossl_methods, sizeof(struct ossl_softc) }; DRIVER_MODULE(ossl, nexus, ossl_driver, NULL, NULL); MODULE_VERSION(ossl, 1); MODULE_DEPEND(ossl, crypto, 1, 1, 1); diff --git a/sys/crypto/openssl/ossl.h b/sys/crypto/openssl/ossl.h index 4f5353818add..3b9313251cff 100644 --- a/sys/crypto/openssl/ossl.h +++ b/sys/crypto/openssl/ossl.h @@ -1,90 +1,92 @@ /* * Copyright (c) 2020 Netflix, Inc * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #ifndef __OSSL_H__ #define __OSSL_H__ /* Compatibility shims. */ #define OPENSSL_cleanse explicit_bzero struct cryptop; struct crypto_session_params; struct ossl_softc; struct ossl_session; int ossl_chacha20_poly1305_decrypt(struct cryptop *crp, const struct crypto_session_params *csp); int ossl_chacha20_poly1305_encrypt(struct cryptop *crp, const struct crypto_session_params *csp); void ossl_cpuid(struct ossl_softc *sc); struct ossl_softc { int32_t sc_cid; bool has_aes; + bool has_aes_gcm; }; /* Needs to be big enough to hold any hash context. */ struct ossl_hash_context { - uint32_t dummy[61]; + uint32_t dummy[196]; } __aligned(32); struct ossl_cipher_context { - uint32_t dummy[61]; + uint32_t dummy[196]; } __aligned(32); struct ossl_session_hash { struct ossl_hash_context ictx; struct ossl_hash_context octx; struct auth_hash *axf; u_int mlen; }; struct ossl_session_cipher { struct ossl_cipher_context dec_ctx; struct ossl_cipher_context enc_ctx; struct ossl_cipher *cipher; }; struct ossl_session { struct ossl_session_cipher cipher; struct ossl_session_hash hash; }; extern struct auth_hash ossl_hash_poly1305; extern struct auth_hash ossl_hash_sha1; extern struct auth_hash ossl_hash_sha224; extern struct auth_hash ossl_hash_sha256; extern struct auth_hash ossl_hash_sha384; extern struct auth_hash ossl_hash_sha512; extern struct ossl_cipher ossl_cipher_aes_cbc; +extern struct ossl_cipher ossl_cipher_aes_gcm; extern struct ossl_cipher ossl_cipher_chacha20; #endif /* !__OSSL_H__ */ diff --git a/sys/crypto/openssl/ossl_aes.c b/sys/crypto/openssl/ossl_aes.c index 382fa80cc56b..93d3ac3f2a99 100644 --- a/sys/crypto/openssl/ossl_aes.c +++ b/sys/crypto/openssl/ossl_aes.c @@ -1,153 +1,256 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Stormshield. * Copyright (c) 2021 Semihalf. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include #include #if defined(__amd64__) || defined(__i386__) #include #elif defined (__aarch64__) #include #endif static ossl_cipher_process_t ossl_aes_cbc; +static ossl_cipher_process_t ossl_aes_gcm; struct ossl_cipher ossl_cipher_aes_cbc = { .type = CRYPTO_AES_CBC, .blocksize = AES_BLOCK_LEN, .ivsize = AES_BLOCK_LEN, /* Filled during initialization based on CPU caps. */ .set_encrypt_key = NULL, .set_decrypt_key = NULL, .process = ossl_aes_cbc }; +struct ossl_cipher ossl_cipher_aes_gcm = { + .type = CRYPTO_AES_NIST_GCM_16, + .blocksize = 1, + .ivsize = AES_GCM_IV_LEN, + + /* Filled during initialization based on CPU caps. */ + .set_encrypt_key = NULL, + .set_decrypt_key = NULL, + .process = ossl_aes_gcm, +}; + static int ossl_aes_cbc(struct ossl_session_cipher *s, struct cryptop *crp, const struct crypto_session_params *csp) { struct crypto_buffer_cursor cc_in, cc_out; unsigned char block[EALG_MAX_BLOCK_LEN]; unsigned char iv[EALG_MAX_BLOCK_LEN]; const unsigned char *in, *inseg; unsigned char *out, *outseg; size_t plen, seglen, inlen, outlen; struct ossl_cipher_context key; struct ossl_cipher *cipher; int blocklen, error; bool encrypt; cipher = s->cipher; encrypt = CRYPTO_OP_IS_ENCRYPT(crp->crp_op); plen = crp->crp_payload_length; blocklen = cipher->blocksize; if (plen % blocklen) return (EINVAL); if (crp->crp_cipher_key != NULL) { if (encrypt) error = cipher->set_encrypt_key(crp->crp_cipher_key, 8 * csp->csp_cipher_klen, &key); else error = cipher->set_decrypt_key(crp->crp_cipher_key, 8 * csp->csp_cipher_klen, &key); if (error) return (error); } else { if (encrypt) key = s->enc_ctx; else key = s->dec_ctx; } crypto_read_iv(crp, iv); /* Derived from ossl_chacha20.c */ crypto_cursor_init(&cc_in, &crp->crp_buf); crypto_cursor_advance(&cc_in, crp->crp_payload_start); inseg = crypto_cursor_segment(&cc_in, &inlen); if (CRYPTO_HAS_OUTPUT_BUFFER(crp)) { crypto_cursor_init(&cc_out, &crp->crp_obuf); crypto_cursor_advance(&cc_out, crp->crp_payload_output_start); } else { cc_out = cc_in; } outseg = crypto_cursor_segment(&cc_out, &outlen); while (plen >= blocklen) { if (inlen < blocklen) { crypto_cursor_copydata(&cc_in, blocklen, block); in = block; inlen = blocklen; } else { in = inseg; } if (outlen < blocklen) { out = block; outlen = blocklen; } else { out = outseg; } /* Figure out how many blocks we can encrypt/decrypt at once. 
*/ seglen = rounddown(MIN(plen, MIN(inlen, outlen)), blocklen); AES_CBC_ENCRYPT(in, out, seglen, &key, iv, encrypt); if (out == block) { crypto_cursor_copyback(&cc_out, blocklen, block); outseg = crypto_cursor_segment(&cc_out, &outlen); } else { crypto_cursor_advance(&cc_out, seglen); outseg += seglen; outlen -= seglen; } if (in == block) { inseg = crypto_cursor_segment(&cc_in, &inlen); } else { crypto_cursor_advance(&cc_in, seglen); inseg += seglen; inlen -= seglen; } plen -= seglen; } explicit_bzero(block, sizeof(block)); explicit_bzero(iv, sizeof(iv)); explicit_bzero(&key, sizeof(key)); return (0); } + +static int +ossl_aes_gcm(struct ossl_session_cipher *s, struct cryptop *crp, + const struct crypto_session_params *csp) +{ + struct ossl_cipher_context key; + struct crypto_buffer_cursor cc_in, cc_out; + unsigned char iv[AES_BLOCK_LEN], tag[AES_BLOCK_LEN]; + struct ossl_gcm_context *ctx; + const unsigned char *inseg; + unsigned char *outseg; + size_t inlen, outlen, seglen; + int error; + bool encrypt; + + encrypt = CRYPTO_OP_IS_ENCRYPT(crp->crp_op); + + if (crp->crp_cipher_key != NULL) { + if (encrypt) + error = s->cipher->set_encrypt_key(crp->crp_cipher_key, + 8 * csp->csp_cipher_klen, &key); + else + error = s->cipher->set_decrypt_key(crp->crp_cipher_key, + 8 * csp->csp_cipher_klen, &key); + if (error) + return (error); + ctx = (struct ossl_gcm_context *)&key; + } else if (encrypt) { + ctx = (struct ossl_gcm_context *)&s->enc_ctx; + } else { + ctx = (struct ossl_gcm_context *)&s->dec_ctx; + } + + crypto_read_iv(crp, iv); + ctx->ops->setiv(ctx, iv, csp->csp_ivlen); + + crypto_cursor_init(&cc_in, &crp->crp_buf); + crypto_cursor_advance(&cc_in, crp->crp_aad_start); + for (size_t alen = crp->crp_aad_length; alen > 0; alen -= seglen) { + inseg = crypto_cursor_segment(&cc_in, &inlen); + seglen = MIN(alen, inlen); + if (ctx->ops->aad(ctx, inseg, seglen) != 0) + return (EINVAL); + crypto_cursor_advance(&cc_in, seglen); + } + + crypto_cursor_init(&cc_in, &crp->crp_buf); + crypto_cursor_advance(&cc_in, crp->crp_payload_start); + if (CRYPTO_HAS_OUTPUT_BUFFER(crp)) { + crypto_cursor_init(&cc_out, &crp->crp_obuf); + crypto_cursor_advance(&cc_out, crp->crp_payload_output_start); + } else { + cc_out = cc_in; + } + + for (size_t plen = crp->crp_payload_length; plen > 0; plen -= seglen) { + inseg = crypto_cursor_segment(&cc_in, &inlen); + outseg = crypto_cursor_segment(&cc_out, &outlen); + seglen = MIN(plen, MIN(inlen, outlen)); + + if (encrypt) { + if (ctx->ops->encrypt(ctx, inseg, outseg, seglen) != 0) + return (EINVAL); + } else { + if (ctx->ops->decrypt(ctx, inseg, outseg, seglen) != 0) + return (EINVAL); + } + + crypto_cursor_advance(&cc_in, seglen); + crypto_cursor_advance(&cc_out, seglen); + } + + error = 0; + if (encrypt) { + ctx->ops->tag(ctx, tag, GMAC_DIGEST_LEN); + crypto_copyback(crp, crp->crp_digest_start, GMAC_DIGEST_LEN, + tag); + } else { + crypto_copydata(crp, crp->crp_digest_start, GMAC_DIGEST_LEN, + tag); + if (ctx->ops->finish(ctx, tag, GMAC_DIGEST_LEN) != 0) + error = EBADMSG; + } + + explicit_bzero(iv, sizeof(iv)); + explicit_bzero(tag, sizeof(tag)); + + return (error); +} diff --git a/sys/crypto/openssl/ossl_aes_gcm.h b/sys/crypto/openssl/ossl_aes_gcm.h new file mode 100644 index 000000000000..9ce8ee193483 --- /dev/null +++ b/sys/crypto/openssl/ossl_aes_gcm.h @@ -0,0 +1,71 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2023 Stormshield + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted 
provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _OSSL_AES_GCM_H_ +#define _OSSL_AES_GCM_H_ + +#include +#include + +struct ossl_gcm_context; + +struct ossl_aes_gcm_ops { + void (*init)(struct ossl_gcm_context *ctx, const void *key, + size_t keylen); + void (*setiv)(struct ossl_gcm_context *ctx, const unsigned char *iv, + size_t ivlen); + int (*aad)(struct ossl_gcm_context *ctx, const unsigned char *aad, + size_t len); + int (*encrypt)(struct ossl_gcm_context *ctx, const unsigned char *in, + unsigned char *out, size_t len); + int (*decrypt)(struct ossl_gcm_context *ctx, const unsigned char *in, + unsigned char *out, size_t len); + int (*finish)(struct ossl_gcm_context *ctx, const unsigned char *tag, + size_t len); + void (*tag)(struct ossl_gcm_context *ctx, unsigned char *tag, + size_t len); +}; + +struct ossl_gcm_context { + struct { + union { + uint64_t u[2]; + uint32_t d[4]; + uint8_t c[16]; + } Yi, EKi, EK0, len, Xi, H; + __uint128_t Htable[16]; + unsigned int mres, ares; + } gcm; + + struct { + uint32_t ks[4 * (RIJNDAEL_MAXNR + 1)]; + int rounds; + } aes_ks; + + const struct ossl_aes_gcm_ops *ops; +}; + +#endif /* !_OSSL_AES_GCM_H_ */ diff --git a/sys/crypto/openssl/ossl_x86.c b/sys/crypto/openssl/ossl_x86.c index 75598d821506..594aee2ab97f 100644 --- a/sys/crypto/openssl/ossl_x86.c +++ b/sys/crypto/openssl/ossl_x86.c @@ -1,129 +1,148 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2020 Netflix, Inc * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #include #include #include #include #include #include #include +#include #include /* * See OPENSSL_ia32cap(3). * * [0] = cpu_feature but with a few custom bits * [1] = cpu_feature2 but with AMD XOP in bit 11 * [2] = cpu_stdext_feature * [3] = cpu_stdext_feature2 */ unsigned int OPENSSL_ia32cap_P[4]; #define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32))) ossl_cipher_setkey_t aesni_set_encrypt_key; ossl_cipher_setkey_t aesni_set_decrypt_key; +#ifdef __amd64__ +int ossl_vaes_vpclmulqdq_capable(void); +ossl_cipher_setkey_t ossl_aes_gcm_setkey_avx512; +#endif + void ossl_cpuid(struct ossl_softc *sc) { uint64_t xcr0; u_int regs[4]; u_int max_cores; /* Derived from OpenSSL_ia32_cpuid. */ OPENSSL_ia32cap_P[0] = cpu_feature & ~(CPUID_B20 | CPUID_IA64); if (cpu_vendor_id == CPU_VENDOR_INTEL) { OPENSSL_ia32cap_P[0] |= CPUID_IA64; if ((cpu_id & 0xf00) != 0xf00) OPENSSL_ia32cap_P[0] |= CPUID_B20; } /* Only leave CPUID_HTT on if HTT is present. */ if (cpu_vendor_id == CPU_VENDOR_AMD && cpu_exthigh >= 0x80000008) { max_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; if (cpu_feature & CPUID_HTT) { if ((cpu_procinfo & CPUID_HTT_CORES) >> 16 <= max_cores) OPENSSL_ia32cap_P[0] &= ~CPUID_HTT; } } else { if (cpu_high >= 4) { cpuid_count(4, 0, regs); max_cores = (regs[0] >> 26) & 0xfff; } else max_cores = -1; } if (max_cores == 0) OPENSSL_ia32cap_P[0] &= ~CPUID_HTT; else if ((cpu_procinfo & CPUID_HTT_CORES) >> 16 == 0) OPENSSL_ia32cap_P[0] &= ~CPUID_HTT; OPENSSL_ia32cap_P[1] = cpu_feature2 & ~AMDID2_XOP; if (cpu_vendor_id == CPU_VENDOR_AMD) OPENSSL_ia32cap_P[1] |= amd_feature2 & AMDID2_XOP; OPENSSL_ia32cap_P[2] = cpu_stdext_feature; if ((OPENSSL_ia32cap_P[1] & CPUID2_XSAVE) == 0) OPENSSL_ia32cap_P[2] &= ~(CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512DQ); /* Disable AVX512F on Skylake-X. 
*/ if ((cpu_id & 0x0fff0ff0) == 0x00050650) OPENSSL_ia32cap_P[2] &= ~(CPUID_STDEXT_AVX512F); if (cpu_feature2 & CPUID2_OSXSAVE) xcr0 = rxcr(0); else xcr0 = 0; if ((xcr0 & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) OPENSSL_ia32cap_P[2] &= ~(CPUID_STDEXT_AVX512VL | CPUID_STDEXT_AVX512BW | CPUID_STDEXT_AVX512IFMA | CPUID_STDEXT_AVX512F); if ((xcr0 & XFEATURE_AVX) != XFEATURE_AVX) { OPENSSL_ia32cap_P[1] &= ~(CPUID2_AVX | AMDID2_XOP | CPUID2_FMA); OPENSSL_ia32cap_P[2] &= ~CPUID_STDEXT_AVX2; } OPENSSL_ia32cap_P[3] = cpu_stdext_feature2; - if (!AESNI_CAPABLE) { - sc->has_aes = false; + if (!AESNI_CAPABLE) return; - } + sc->has_aes = true; ossl_cipher_aes_cbc.set_encrypt_key = aesni_set_encrypt_key; ossl_cipher_aes_cbc.set_decrypt_key = aesni_set_decrypt_key; + +#ifdef __amd64__ + if (ossl_vaes_vpclmulqdq_capable()) { + ossl_cipher_aes_gcm.set_encrypt_key = + ossl_aes_gcm_setkey_avx512; + ossl_cipher_aes_gcm.set_decrypt_key = + ossl_aes_gcm_setkey_avx512; + sc->has_aes_gcm = true; + } else { + sc->has_aes_gcm = false; + } +#else + sc->has_aes_gcm = false; +#endif } diff --git a/sys/modules/ossl/Makefile b/sys/modules/ossl/Makefile index 765e70a03edd..d56fef428494 100644 --- a/sys/modules/ossl/Makefile +++ b/sys/modules/ossl/Makefile @@ -1,62 +1,64 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/crypto/openssl .PATH: ${SRCTOP}/sys/crypto/openssl/${MACHINE_CPUARCH} KMOD= ossl OBJS+= ${OBJS.${MACHINE_CPUARCH}} SRCS= bus_if.h \ cryptodev_if.h \ device_if.h \ ossl.c \ ossl_aes.c \ ossl_chacha20.c \ ossl_poly1305.c \ ossl_sha1.c \ ossl_sha256.c \ ossl_sha512.c \ ${SRCS.${MACHINE_CPUARCH}} SRCS.aarch64= \ chacha-armv8.S \ poly1305-armv8.S \ sha1-armv8.S \ sha256-armv8.S \ sha512-armv8.S \ vpaes-armv8.S \ ossl_aarch64.c SRCS.amd64= \ + aes-gcm-avx512.S \ aesni-x86_64.S \ chacha-x86_64.S \ poly1305-x86_64.S \ sha1-x86_64.S \ sha256-x86_64.S \ sha512-x86_64.S \ + ossl_aes_gcm.c \ ossl_x86.c SRCS.i386= \ aesni-x86.S \ chacha-x86.S \ poly1305-x86.S \ sha1-586.S \ sha256-586.S \ sha512-586.S \ ossl_x86.c # For arm64, we are forced to rewrite the compiler invocation for the assembly # files, to remove -mgeneral-regs-only. ${SRCS.aarch64:M*.S:S/S/o/}: ${.TARGET:R}.S ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${PROF} ${.IMPSRC} ${CTFCONVERT_CMD} # Based on modules/armv8crypto/Makefile. # Clang doesn't recognize "aes*" instructions without -march set. aesv8-armx.o: aesv8-armx.S ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${PROF} \ -march=armv8-a+crypto ${.IMPSRC} ${CTFCONVERT_CMD} OBJS.aarch64= aesv8-armx.o .include