diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S @@ -0,0 +1,6390 @@ +/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */ +#include "arm_arch.h" + +#if __ARM_MAX_ARCH__>=8 +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_128_kernel +.type aes_gcm_enc_128_kernel,%function +.align 4 +aes_gcm_enc_128_kernel: + cbz x1, .L128_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #160] //load rk10 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 {v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + lsr x5, x1, #3 //byte_len + mov x15, x5 + + ld1 {v18.4s}, [x8], #16 //load rk0 + add x4, x0, x1, lsr #3 //end_input_ptr + sub x5, x5, #1 //byte_len - 1 + + lsr x12, x11, #32 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + fmov d1, x10 //CTR block 1 + rev w12, w12 //rev_ctr32 + + add w12, w12, #1 //increment rev_ctr32 + orr w11, w11, w11 + ld1 {v19.4s}, [x8], #16 //load rk1 + + rev w9, w12 //CTR block 1 + add w12, w12, #1 //CTR block 1 + fmov d3, x10 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 3 + ld1 {v20.4s}, [x8], #16 //load rk2 + + add w12, w12, #1 //CTR block 3 + fmov v3.d[1], x9 //CTR block 3 + + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ld1 {v21.4s}, [x8], #16 //load rk3 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 
- round 2 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + add x5, x5, x0 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + cmp x0, x5 //check if we have <= 4 blocks + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v2.16b, v27.16b //AES block 2 - round 9 + + aese v0.16b, v27.16b //AES block 0 - round 9 + + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v1.16b, v27.16b //AES block 1 - round 9 + + aese v3.16b, v27.16b //AES block 3 - round 9 + b.ge .L128_enc_tail //handle tail + + ldp x6, x7, [x0, #0] //AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + ldp x21, x22, [x0, #32] //AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + ldp x19, x20, [x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + ldp x23, x24, [x0, #48] //AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + eor x6, x6, x13 //AES block 0 - round 10 low + eor x7, x7, x14 //AES block 0 - round 10 high + + eor x21, x21, x13 //AES block 2 - round 10 low + fmov d4, x6 //AES block 0 - mov low + + eor x19, x19, x13 //AES block 1 - round 10 low + eor x22, x22, x14 //AES block 2 - round 10 high + fmov v4.d[1], x7 //AES block 0 - mov high + + fmov d5, x19 //AES block 1 - mov low + eor x20, x20, x14 //AES block 1 - round 10 high + + eor x23, x23, x13 //AES block 3 - round 10 low + fmov v5.d[1], x20 //AES block 1 - mov high + + fmov d6, x21 //AES block 2 - mov low + eor x24, x24, x14 //AES block 3 - round 10 high + rev w9, w12 //CTR block 4 + + fmov v6.d[1], x22 //AES block 2 - mov high + orr x9, x11, x9, lsl #32 //CTR block 4 + + 
eor v4.16b, v4.16b, v0.16b //AES block 0 - result + fmov d0, x10 //CTR block 4 + add w12, w12, #1 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + + eor v5.16b, v5.16b, v1.16b //AES block 1 - result + fmov d1, x10 //CTR block 5 + orr x9, x11, x9, lsl #32 //CTR block 5 + + add w12, w12, #1 //CTR block 5 + add x0, x0, #64 //AES input_ptr update + fmov v1.d[1], x9 //CTR block 5 + + fmov d7, x23 //AES block 3 - mov low + rev w9, w12 //CTR block 6 + st1 { v4.16b}, [x2], #16 //AES block 0 - store result + + fmov v7.d[1], x24 //AES block 3 - mov high + orr x9, x11, x9, lsl #32 //CTR block 6 + + add w12, w12, #1 //CTR block 6 + eor v6.16b, v6.16b, v2.16b //AES block 2 - result + st1 { v5.16b}, [x2], #16 //AES block 1 - store result + + fmov d2, x10 //CTR block 6 + cmp x0, x5 //check if we have <= 8 blocks + + fmov v2.d[1], x9 //CTR block 6 + rev w9, w12 //CTR block 7 + st1 { v6.16b}, [x2], #16 //AES block 2 - store result + + orr x9, x11, x9, lsl #32 //CTR block 7 + + eor v7.16b, v7.16b, v3.16b //AES block 3 - result + st1 { v7.16b}, [x2], #16 //AES block 3 - store result + b.ge .L128_enc_prepretail //do prepretail + +.L128_enc_main_loop: //main loop start + ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov d3, x10 //CTR block 4k+3 + + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + add w12, w12, #1 //CTR block 4k+3 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x24, x24, x14 //AES block 4k+3 - round 10 high + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + rev w9, w12 //CTR block 4k+8 + + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + mov d8, v4.d[1] //GHASH block 4k - mid + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + add w12, w12, #1 //CTR block 4k+8 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 
- low + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor x7, x7, x14 //AES block 4k+4 - round 10 high + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor x6, x6, x13 //AES block 4k+4 - round 10 low + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + shl d8, d8, #56 //mod_constant + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor x19, x19, x13 //AES block 4k+5 - round 10 low + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + eor x23, x23, x13 //AES block 4k+3 - round 10 low + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + fmov d4, x6 //AES block 4k+4 - mov low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + add x0, x0, #64 //AES input_ptr update + fmov d7, x23 //AES block 4k+3 - mov low + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + fmov d5, x19 //AES block 4k+5 - mov low + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor x20, x20, x14 //AES block 4k+5 - round 10 high + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + fmov v5.d[1], x20 //AES block 4k+5 - mov high + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + fmov v7.d[1], x24 //AES block 4k+3 - mov high + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b 
//AES block 4k+7 - round 6 + cmp x0, x5 //.LOOP CONTROL + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + eor x21, x21, x13 //AES block 4k+6 - round 10 low + eor x22, x22, x14 //AES block 4k+6 - round 10 high + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + fmov d6, x21 //AES block 4k+6 - mov low + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + fmov v6.d[1], x22 //AES block 4k+6 - mov high + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result + + fmov d0, x10 //CTR block 4k+8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result + + add w12, w12, #1 //CTR block 4k+9 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + fmov d1, x10 //CTR block 4k+9 + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 + st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + add w12, w12, #1 //CTR block 4k+10 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + fmov d2, x10 //CTR block 4k+10 + + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + + fmov v2.d[1], x9 //CTR block 4k+10 + st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + rev w9, w12 //CTR block 4k+11 + + orr x9, x11, x9, lsl #32 //CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result + b.lt .L128_enc_main_loop + +.L128_enc_prepretail: //PREPRETAIL + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + fmov d3, x10 //CTR block 4k+3 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + add w12, w12, #1 //CTR block 4k+3 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + eor v4.16b, v4.16b, v11.16b //PRE 1 + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + mov d31, v6.d[1] //GHASH block 4k+2 - mid + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + pmull v30.1q, v30.1d, 
v17.1d //GHASH block 4k+1 - mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + shl d8, d8, #56 //mod_constant + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull v28.1q, v9.1d, v8.1d + eor v10.16b, v10.16b, v9.16b //karatsuba tidy up + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + ext v9.16b, v9.16b, v9.16b, #8 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v11.16b + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v10.16b, v10.16b, v28.16b + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v9.16b + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + pmull v28.1q, v10.1d, v8.1d + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + ext v10.16b, v10.16b, v10.16b, #8 + + aese v3.16b, v26.16b + 
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v28.16b + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + eor v11.16b, v11.16b, v10.16b + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 +.L128_enc_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + cmp x5, #48 + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + eor x6, x6, x13 //AES block 4k+4 - round 10 low + eor x7, x7, x14 //AES block 4k+4 - round 10 high + + fmov d4, x6 //AES block 4k+4 - mov low + + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result + + b.gt .L128_enc_blocks_more_than_3 + + sub w12, w12, #1 + movi v11.8b, #0 + mov v3.16b, v2.16b + + cmp x5, #32 + mov v2.16b, v1.16b + movi v9.8b, #0 + + movi v10.8b, #0 + b.gt .L128_enc_blocks_more_than_2 + + mov v3.16b, v1.16b + cmp x5, #16 + + sub w12, w12, #1 + b.gt .L128_enc_blocks_more_than_1 + + sub w12, w12, #1 + b .L128_enc_blocks_less_than_1 +.L128_enc_blocks_more_than_3: //blocks left > 3 + st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + + ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-3 block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + eor x7, x7, x14 //AES final-2 block - round 10 high + eor x6, x6, x13 //AES final-2 block - round 10 low + + fmov d5, x6 //AES final-2 block - mov low + + movi v8.8b, #0 //suppress further partial tag feed in + fmov v5.d[1], x7 //AES final-2 block - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + mov d22, v4.d[1] //GHASH final-3 block - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + mov d10, v17.d[1] //GHASH final-3 block - mid + + eor v5.16b, v5.16b, v1.16b //AES final-2 block - result + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid +.L128_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v4.16b, v5.16b //GHASH final-2 block + ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x6, x6, x13 //AES final-1 block - round 10 low + + fmov d5, x6 //AES final-1 block - mov low + eor x7, x7, x14 //AES final-1 block - round 10 high + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + fmov v5.d[1], x7 //AES final-1 block - mov high + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + eor v5.16b, v5.16b, v2.16b //AES final-1 block - result + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 
block - mid +.L128_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + + rev64 v4.16b, v5.16b //GHASH final-1 block + ldp x6, x7, [x0], #16 //AES final block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final block - round 10 high + eor x6, x6, x13 //AES final block - round 10 low + + fmov d5, x6 //AES final block - mov low + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + fmov v5.d[1], x7 //AES final block - mov high + + mov d22, v4.d[1] //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + eor v5.16b, v5.16b, v3.16b //AES final block - result + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid + movi v8.8b, #0 //suppress further partial tag feed in +.L128_enc_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + mvn x13, xzr //rk10_l = 0xffffffffffffffff + + mvn x14, xzr //rk10_h = 0xffffffffffffffff + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk10_h is mask for top 64b of last block + cmp x1, #64 + + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + + fmov d0, x6 //ctr0b is mask for last block + + fmov v0.d[1], x7 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d8, v4.d[1] //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing + + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + st1 { v5.16b}, [x2] //store all 16B + + str w9, [x16, #12] //store the updated counter + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + 
ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L128_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel +.globl aes_gcm_dec_128_kernel +.type aes_gcm_dec_128_kernel,%function +.align 4 +aes_gcm_dec_128_kernel: + cbz x1, .L128_dec_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr x5, x1, #3 //byte_len + mov x15, x5 + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #160] //load rk10 +#ifdef __AARCH64EB__ + ror x14, x14, 32 + ror x13, x13, 32 +#endif + sub x5, x5, #1 //byte_len - 1 + ld1 {v18.4s}, [x8], #16 //load rk0 + + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + lsr x12, x11, #32 + fmov d2, x10 //CTR block 2 + + ld1 {v19.4s}, [x8], #16 //load rk1 + orr w11, w11, w11 + rev w12, w12 //rev_ctr32 + + fmov d1, x10 //CTR block 1 + add w12, w12, #1 //increment rev_ctr32 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + rev w9, w12 //CTR block 1 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 {v20.4s}, [x8], #16 //load rk2 + add w12, w12, #1 //CTR block 1 + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + fmov d3, x10 //CTR block 3 + orr x9, x11, x9, lsl #32 //CTR block 3 + add w12, w12, #1 //CTR block 3 + + fmov v3.d[1], x9 //CTR block 3 + add x4, x0, x1, lsr #3 //end_input_ptr + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ld1 {v21.4s}, [x8], #16 //load rk3 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext 
v14.16b, v14.16b, v14.16b, #8 +#endif + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + add x5, x5, x0 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v2.16b, v27.16b //AES block 2 - round 9 + + aese v3.16b, v27.16b //AES block 3 - round 9 + + aese v0.16b, v27.16b //AES block 0 - round 9 + cmp x0, x5 //check if we have <= 4 blocks + + aese v1.16b, v27.16b //AES block 1 - round 9 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + b.ge .L128_dec_tail //handle tail + + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext + + eor v1.16b, v5.16b, v1.16b //AES block 1 - result + ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext + + eor v0.16b, v4.16b, v0.16b //AES block 0 - result + rev64 v4.16b, v4.16b //GHASH block 0 + rev w9, w12 //CTR block 4 + + orr x9, x11, x9, lsl #32 //CTR block 4 + add w12, w12, #1 //CTR block 4 + ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext + + rev64 v5.16b, v5.16b //GHASH block 1 + mov x19, v1.d[0] //AES block 1 - mov low + + mov x20, v1.d[1] //AES block 1 - mov high + + mov x6, v0.d[0] //AES block 0 - mov low + cmp x0, x5 //check if we have <= 8 blocks + + mov x7, v0.d[1] //AES block 0 - mov high + + fmov d0, x10 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + eor x19, x19, x13 //AES block 1 - round 10 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + fmov d1, x10 //CTR block 5 + add w12, w12, #1 //CTR block 5 + orr x9, x11, x9, lsl #32 //CTR block 5 + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + add w12, w12, #1 //CTR block 6 + + orr x9, x11, x9, lsl #32 //CTR block 6 + + eor x20, x20, x14 //AES block 1 - round 10 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + eor x6, x6, x13 //AES block 0 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor 
v2.16b, v6.16b, v2.16b //AES block 2 - result + + eor x7, x7, x14 //AES block 0 - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + stp x6, x7, [x2], #16 //AES block 0 - store result + + stp x19, x20, [x2], #16 //AES block 1 - store result + b.ge .L128_dec_prepretail //do prepretail + +.L128_dec_main_loop: //main loop start + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + mov x21, v2.d[0] //AES block 4k+2 - mov low + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + mov x22, v2.d[1] //AES block 4k+2 - mov high + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d2, x10 //CTR block 4k+6 + + rev64 v6.16b, v6.16b //GHASH block 4k+2 + fmov v2.d[1], x9 //CTR block 4k+6 + rev w9, w12 //CTR block 4k+7 + + mov x23, v3.d[0] //AES block 4k+3 - mov low + eor v4.16b, v4.16b, v11.16b //PRE 1 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + mov x24, v3.d[1] //AES block 4k+3 - mov high + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + fmov d3, x10 //CTR block 4k+7 + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + fmov v3.d[1], x9 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + mov d8, v4.d[1] //GHASH block 4k - mid + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor x23, x23, x13 //AES block 4k+3 - round 10 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + eor x22, x22, x14 //AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + 
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor x24, x24, x14 //AES block 4k+3 - round 10 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + eor x21, x21, x13 //AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + movi v8.8b, #0xc2 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + add w12, w12, #1 //CTR block 4k+7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + shl d8, d8, #56 //mod_constant + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + rev w9, w12 //CTR block 4k+8 + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + + add w12, w12, #1 //CTR block 4k+8 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + rev64 v5.16b, v5.16b //GHASH block 4k+5 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + mov x7, v0.d[1] //AES block 4k+4 - mov high + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + mov x6, v0.d[0] //AES block 4k+4 - mov low + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + fmov d0, x10 //CTR block 4k+8 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + + aese v2.16b, v27.16b //AES 
block 4k+6 - round 9 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor x7, x7, x14 //AES block 4k+4 - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + mov x20, v1.d[1] //AES block 4k+5 - mov high + eor x6, x6, x13 //AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result + mov x19, v1.d[0] //AES block 4k+5 - mov low + add w12, w12, #1 //CTR block 4k+9 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + fmov d1, x10 //CTR block 4k+9 + cmp x0, x5 //.LOOP CONTROL + + rev64 v4.16b, v4.16b //GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + fmov v1.d[1], x9 //CTR block 4k+9 + + rev w9, w12 //CTR block 4k+10 + add w12, w12, #1 //CTR block 4k+10 + + eor x20, x20, x14 //AES block 4k+5 - round 10 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + stp x6, x7, [x2], #16 //AES block 4k+4 - store result + + eor x19, x19, x13 //AES block 4k+5 - round 10 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + stp x19, x20, [x2], #16 //AES block 4k+5 - store result + + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + b.lt .L128_dec_main_loop + +.L128_dec_prepretail: //PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + mov x21, v2.d[0] //AES block 4k+2 - mov low + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + mov x22, v2.d[1] //AES block 4k+2 - mov high + + eor v4.16b, v4.16b, v11.16b //PRE 1 + fmov d2, x10 //CTR block 4k+6 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + fmov v2.d[1], x9 //CTR block 4k+6 + + rev w9, w12 //CTR block 4k+7 + mov x23, v3.d[0] //AES block 4k+3 - mov low + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d10, v17.d[1] //GHASH block 4k - mid + mov x24, v3.d[1] //AES block 4k+3 - mov high + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov v3.d[1], x9 //CTR block 4k+7 + + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + 
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + movi v8.8b, #0xc2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor x23, x23, x13 //AES block 4k+3 - round 10 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor x21, x21, x13 //AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + shl d8, d8, #56 //mod_constant + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor x24, x24, x14 //AES block 4k+3 - round 10 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - 
round 8 + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor x22, x22, x14 //AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 + add w12, w12, #1 //CTR block 4k+7 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low +.L128_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result + + mov x7, v0.d[1] //AES block 4k+4 - mov high + + mov x6, v0.d[0] //AES block 4k+4 - mov low + + cmp x5, #48 + + eor x7, x7, x14 //AES block 4k+4 - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + eor x6, x6, x13 //AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + b.gt .L128_dec_blocks_more_than_3 + + mov v3.16b, v2.16b + sub w12, w12, #1 + movi v11.8b, #0 + + movi v9.8b, #0 + mov v2.16b, v1.16b + + movi v10.8b, #0 + cmp x5, #32 + b.gt .L128_dec_blocks_more_than_2 + + cmp x5, #16 + + mov v3.16b, v1.16b + sub w12, w12, #1 + b.gt .L128_dec_blocks_more_than_1 + + sub w12, w12, #1 + b .L128_dec_blocks_less_than_1 +.L128_dec_blocks_more_than_3: //blocks left > 3 + rev64 v4.16b, v5.16b //GHASH final-3 block + ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d10, v17.d[1] //GHASH final-3 block - mid + stp x6, x7, [x2], #16 //AES final-3 block - store result + eor v0.16b, v5.16b, v1.16b //AES final-2 block - result + + mov d22, v4.d[1] //GHASH final-3 block - mid + mov x7, v0.d[1] //AES final-2 block - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + mov x6, v0.d[0] //AES final-2 block - mov low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + eor x7, x7, x14 //AES final-2 block - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor x6, x6, x13 //AES final-2 block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif +.L128_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v4.16b, v5.16b //GHASH final-2 block + ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor v0.16b, v5.16b, v2.16b //AES final-1 block - result + stp x6, x7, [x2], #16 //AES final-2 block - store result + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + mov x6, v0.d[0] //AES final-1 block - mov low + + mov x7, v0.d[1] //AES final-1 block - mov high + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor x6, x6, x13 //AES final-1 block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block 
- low + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid + eor x7, x7, x14 //AES final-1 block - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L128_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v4.16b, v5.16b //GHASH final-1 block + + ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d22, v4.d[1] //GHASH final-1 block - mid + + eor v0.16b, v5.16b, v3.16b //AES final block - result + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + stp x6, x7, [x2], #16 //AES final-1 block - store result + mov x6, v0.d[0] //AES final block - mov low + + mov x7, v0.d[1] //AES final block - mov high + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + movi v8.8b, #0 //suppress further partial tag feed in + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + eor x7, x7, x14 //AES final block - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES final block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid +.L128_dec_blocks_less_than_1: //blocks left <= 1 + + mvn x14, xzr //rk10_h = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + mvn x13, xzr //rk10_l = 0xffffffffffffffff + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk10_h is mask for top 64b of last block + cmp x1, #64 + + csel x10, x14, xzr, lt + csel x9, x13, x14, lt + + fmov d0, x9 //ctr0b is mask for last block + + mov v0.d[1], x10 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + ldp x4, x5, [x2] //load existing bytes we need to not overwrite + + and x7, x7, x10 + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + mov d8, v4.d[1] //GHASH final block - mid + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + bic x4, x4, x9 //mask out low existing bytes + and x6, x6, x9 + +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + bic x5, x5, x10 //mask out high existing bytes + shl d8, d8, #56 //mod_constant + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + orr x6, x6, x4 + str w9, [x16, #12] //store the updated counter + + orr x7, x7, x5 + stp x6, x7, [x2] + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v8.16b 
//MODULO - fold into low + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L128_dec_ret: + mov w0, #0x0 + ret +.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel +.globl aes_gcm_enc_192_kernel +.type aes_gcm_enc_192_kernel,%function +.align 4 +aes_gcm_enc_192_kernel: + cbz x1, .L192_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #192] //load rk12 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 {v18.4s}, [x8], #16 //load rk0 + + ld1 {v19.4s}, [x8], #16 //load rk1 + + ld1 {v20.4s}, [x8], #16 //load rk2 + + lsr x12, x11, #32 + ld1 {v21.4s}, [x8], #16 //load rk3 + orr w11, w11, w11 + + ld1 {v22.4s}, [x8], #16 //load rk4 + rev w12, w12 //rev_ctr32 + + add w12, w12, #1 //increment rev_ctr32 + fmov d3, x10 //CTR block 3 + + rev w9, w12 //CTR block 1 + add w12, w12, #1 //CTR block 1 + fmov d1, x10 //CTR block 1 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 3 + ld1 {v23.4s}, [x8], #16 //load rk5 + + fmov v3.d[1], x9 //CTR block 3 + + ld1 {v24.4s}, [x8], #16 //load rk6 + + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 
+ + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + lsr x5, x1, #3 //byte_len + mov x15, x5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + sub x5, x5, #1 //byte_len - 1 + + eor v16.16b, v16.16b, v8.16b //h2k | h1k + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v2.16b, v29.16b //AES block 2 - round 11 + add x4, x0, x1, lsr #3 //end_input_ptr + add x5, x5, x0 + + aese v1.16b, v29.16b //AES block 1 - round 11 + cmp x0, x5 //check if we have <= 4 blocks + + aese v0.16b, v29.16b //AES block 0 - round 11 + add w12, w12, #1 //CTR block 3 + + aese v3.16b, v29.16b //AES block 3 - round 11 + b.ge .L192_enc_tail //handle tail + + rev w9, w12 //CTR block 4 + ldp x6, x7, [x0, #0] //AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + orr x9, x11, x9, lsl #32 //CTR block 4 + ldp x21, x22, [x0, #32] //AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + ldp x23, x24, [x0, #48] //AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + ldp x19, x20, 
[x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + add x0, x0, #64 //AES input_ptr update + cmp x0, x5 //check if we have <= 8 blocks + + eor x6, x6, x13 //AES block 0 - round 12 low + + eor x7, x7, x14 //AES block 0 - round 12 high + eor x22, x22, x14 //AES block 2 - round 12 high + fmov d4, x6 //AES block 0 - mov low + + eor x24, x24, x14 //AES block 3 - round 12 high + fmov v4.d[1], x7 //AES block 0 - mov high + + eor x21, x21, x13 //AES block 2 - round 12 low + eor x19, x19, x13 //AES block 1 - round 12 low + + fmov d5, x19 //AES block 1 - mov low + eor x20, x20, x14 //AES block 1 - round 12 high + + fmov v5.d[1], x20 //AES block 1 - mov high + + eor x23, x23, x13 //AES block 3 - round 12 low + fmov d6, x21 //AES block 2 - mov low + + add w12, w12, #1 //CTR block 4 + eor v4.16b, v4.16b, v0.16b //AES block 0 - result + fmov d0, x10 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + + orr x9, x11, x9, lsl #32 //CTR block 5 + add w12, w12, #1 //CTR block 5 + + fmov d7, x23 //AES block 3 - mov low + st1 { v4.16b}, [x2], #16 //AES block 0 - store result + + fmov v6.d[1], x22 //AES block 2 - mov high + + eor v5.16b, v5.16b, v1.16b //AES block 1 - result + fmov d1, x10 //CTR block 5 + st1 { v5.16b}, [x2], #16 //AES block 1 - store result + + fmov v7.d[1], x24 //AES block 3 - mov high + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + + orr x9, x11, x9, lsl #32 //CTR block 6 + + add w12, w12, #1 //CTR block 6 + eor v6.16b, v6.16b, v2.16b //AES block 2 - result + fmov d2, x10 //CTR block 6 + + fmov v2.d[1], x9 //CTR block 6 + rev w9, w12 //CTR block 7 + + orr x9, x11, x9, lsl #32 //CTR block 7 + st1 { v6.16b}, [x2], #16 //AES block 2 - store result + + eor v7.16b, v7.16b, v3.16b //AES block 3 - result + st1 { v7.16b}, [x2], #16 //AES block 3 - store result + b.ge .L192_enc_prepretail //do prepretail + +.L192_enc_main_loop: //main loop start + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + fmov d3, x10 //CTR block 4k+3 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + fmov v3.d[1], x9 //CTR block 4k+3 + + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + eor v4.16b, v4.16b, v11.16b //PRE 1 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x24, x24, x14 //AES block 4k+3 - round 12 high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b 
//AES block 4k+4 - round 2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor x21, x21, x13 //AES block 4k+6 - round 12 low + + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor x19, x19, x13 //AES block 4k+5 - round 12 low + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + mov d10, v17.d[1] //GHASH block 4k - mid + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + eor x20, x20, x14 //AES block 4k+5 - round 12 high + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + add w12, w12, #1 //CTR block 4k+3 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + eor x22, x22, x14 //AES block 4k+6 - round 12 high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor x23, x23, x13 //AES block 4k+3 - round 12 low + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + rev w9, w12 //CTR block 4k+8 + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + add x0, x0, #64 //AES input_ptr update + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + movi v8.8b, #0xc2 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + eor x7, x7, x14 //AES block 4k+4 - round 12 high + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor x6, x6, x13 //AES block 4k+4 - round 12 low + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + shl d8, d8, #56 //mod_constant + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + fmov d5, x19 //AES block 4k+5 - mov low + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v3.16b, v24.16b 
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + fmov v5.d[1], x20 //AES block 4k+5 - mov high + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + cmp x0, x5 //.LOOP CONTROL + fmov d4, x6 //AES block 4k+4 - mov low + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + fmov d7, x23 //AES block 4k+3 - mov low + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + add w12, w12, #1 //CTR block 4k+8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + fmov v7.d[1], x24 //AES block 4k+3 - mov high + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + fmov d6, x21 //AES block 4k+6 - mov low + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v0.16b, v29.16b //AES block 4k+4 - round 11 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result + fmov d0, x10 //CTR block 4k+8 + + aese v1.16b, v29.16b //AES block 4k+5 - round 11 + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + fmov v6.d[1], x22 //AES block 4k+6 - mov high + st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + + eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result + add w12, w12, #1 //CTR block 4k+9 + fmov d1, x10 //CTR block 4k+9 + + aese v2.16b, v29.16b //AES block 4k+6 - round 11 + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + + add w12, w12, #1 //CTR block 4k+10 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + + aese v3.16b, v29.16b //AES block 4k+7 - round 11 + eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result + fmov d2, x10 //CTR block 4k+10 + + st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + fmov v2.d[1], x9 //CTR block 4k+10 + rev w9, w12 //CTR block 4k+11 + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + orr x9, x11, x9, lsl #32 //CTR block 4k+11 + + eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result + st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result + b.lt 
.L192_enc_main_loop + +.L192_enc_prepretail: //PREPRETAIL + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + fmov d3, x10 //CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + add w12, w12, #1 //CTR block 4k+3 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + + fmov v3.d[1], x9 //CTR block 4k+3 + eor v4.16b, v4.16b, v11.16b //PRE 1 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 
4k+5 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v9.16b //karatsuba tidy up + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + shl d8, d8, #56 //mod_constant + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v11.16b + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + pmull v30.1q, v9.1d, v8.1d + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + ext v9.16b, v9.16b, v9.16b, #8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v30.16b + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v9.16b + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + pmull v30.1q, v10.1d, v8.1d + + ext v10.16b, v10.16b, v10.16b, #8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v11.16b, v11.16b, v30.16b + + aese v0.16b, v29.16b //AES block 4k+4 - round 11 + + aese v3.16b, v29.16b //AES block 4k+7 - round 11 + + aese v2.16b, v29.16b //AES block 4k+6 - round 11 + + aese v1.16b, v29.16b //AES block 4k+5 - round 11 + eor v11.16b, v11.16b, v10.16b +.L192_enc_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 4k+4 - round 12 low + eor x7, x7, x14 //AES block 4k+4 - round 12 high + + fmov d4, x6 //AES block 4k+4 - mov low + + fmov v4.d[1], x7 //AES block 4k+4 - mov high + cmp x5, #48 + + eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + b.gt .L192_enc_blocks_more_than_3 + + sub w12, w12, #1 + movi v10.8b, #0 + + mov v3.16b, v2.16b + movi v9.8b, #0 + cmp x5, #32 + + mov v2.16b, v1.16b + movi v11.8b, #0 + b.gt .L192_enc_blocks_more_than_2 + + sub w12, w12, #1 + + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .L192_enc_blocks_more_than_1 + + sub w12, w12, #1 + b .L192_enc_blocks_less_than_1 +.L192_enc_blocks_more_than_3: //blocks left > 3 + st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + + ldp x6, x7, [x0], #16 
//AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-3 block + + eor x6, x6, x13 //AES final-2 block - round 12 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final-2 block - round 12 high + fmov d5, x6 //AES final-2 block - mov low + + fmov v5.d[1], x7 //AES final-2 block - mov high + + mov d22, v4.d[1] //GHASH final-3 block - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + + mov d10, v17.d[1] //GHASH final-3 block - mid + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b //AES final-2 block - result +.L192_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v4.16b, v5.16b //GHASH final-2 block + ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final-1 block - round 12 high + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + eor x6, x6, x13 //AES final-1 block - round 12 low + + fmov d5, x6 //AES final-1 block - mov low + + fmov v5.d[1], x7 //AES final-1 block - mov high + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + eor v5.16b, v5.16b, v2.16b //AES final-1 block - result + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L192_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + + ldp x6, x7, [x0], #16 //AES final block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-1 block + + eor x6, x6, x13 //AES final block - round 12 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + movi v8.8b, #0 //suppress further partial tag feed in + + mov d22, v4.d[1] //GHASH final-1 block - mid + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + eor x7, x7, x14 //AES final block - round 12 high + fmov d5, x6 //AES final block - mov low + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + fmov v5.d[1], x7 //AES final block - mov high + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + eor v5.16b, v5.16b, v3.16b //AES final block - result + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid +.L192_enc_blocks_less_than_1: //blocks left <= 1 + + ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + mvn x14, xzr //rk12_h = 0xffffffffffffffff + + neg x1, x1 //bit_length = 128 - #bits in input 
(in range [1,128]) + mvn x13, xzr //rk12_l = 0xffffffffffffffff + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk12_h is mask for top 64b of last block + cmp x1, #64 + + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + + fmov d0, x6 //ctr0b is mask for last block + + fmov v0.d[1], x7 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d8, v4.d[1] //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + str w9, [x16, #12] //store the updated counter + + st1 { v5.16b}, [x2] //store all 16B + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L192_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel +.globl aes_gcm_dec_192_kernel +.type aes_gcm_dec_192_kernel,%function +.align 4 +aes_gcm_dec_192_kernel: + cbz x1, .L192_dec_ret + stp x19, x20, [sp, #-112]! 
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add x4, x0, x1, lsr #3 //end_input_ptr + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #192] //load rk12 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + ld1 {v18.4s}, [x8], #16 //load rk0 + + lsr x5, x1, #3 //byte_len + mov x15, x5 + ld1 {v19.4s}, [x8], #16 //load rk1 + + lsr x12, x11, #32 + orr w11, w11, w11 + fmov d3, x10 //CTR block 3 + + rev w12, w12 //rev_ctr32 + fmov d1, x10 //CTR block 1 + + add w12, w12, #1 //increment rev_ctr32 + ld1 {v20.4s}, [x8], #16 //load rk2 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + rev w9, w12 //CTR block 1 + + add w12, w12, #1 //CTR block 1 + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 {v21.4s}, [x8], #16 //load rk3 + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + orr x9, x11, x9, lsl #32 //CTR block 3 + + fmov v3.d[1], x9 //CTR block 3 + + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + add w12, w12, #1 //CTR block 3 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b 
//AES block 2 - round 4 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + sub x5, x5, #1 //byte_len - 1 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + add x5, x5, x0 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + cmp x0, x5 //check if we have <= 4 blocks + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v3.16b, v29.16b //AES block 3 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v2.16b, v29.16b //AES block 2 - round 11 + + aese v1.16b, v29.16b //AES block 1 - round 11 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v0.16b, v29.16b //AES block 0 - round 11 + b.ge .L192_dec_tail //handle tail + + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext + + eor v1.16b, v5.16b, v1.16b //AES block 1 - result + + eor v0.16b, v4.16b, v0.16b //AES block 0 - result + rev w9, w12 //CTR block 4 + ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext + + mov x19, v1.d[0] //AES block 1 - mov low + + mov x20, v1.d[1] //AES block 1 - mov high + + mov x6, v0.d[0] //AES block 0 - mov low + orr x9, x11, x9, lsl #32 //CTR block 4 + add w12, w12, #1 //CTR block 4 + + mov x7, v0.d[1] //AES block 0 - mov high + rev64 v4.16b, v4.16b //GHASH block 0 + + fmov d0, x10 //CTR block 4 + rev64 v5.16b, v5.16b //GHASH block 1 + cmp x0, x5 //check if we have <= 8 blocks + + eor x19, x19, x13 //AES block 1 - round 12 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + + orr x9, x11, x9, lsl #32 //CTR block 5 + fmov d1, x10 //CTR block 5 + eor x20, x20, x14 //AES block 1 - round 12 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + add w12, w12, #1 //CTR block 5 + fmov v1.d[1], x9 //CTR 
block 5 + eor x6, x6, x13 //AES block 0 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + rev w9, w12 //CTR block 6 + eor x7, x7, x14 //AES block 0 - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + stp x6, x7, [x2], #16 //AES block 0 - store result + orr x9, x11, x9, lsl #32 //CTR block 6 + + stp x19, x20, [x2], #16 //AES block 1 - store result + + add w12, w12, #1 //CTR block 6 + eor v2.16b, v6.16b, v2.16b //AES block 2 - result + b.ge .L192_dec_prepretail //do prepretail + +.L192_dec_main_loop: //main loop start + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + mov x21, v2.d[0] //AES block 4k+2 - mov low + + mov x22, v2.d[1] //AES block 4k+2 - mov high + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + fmov d2, x10 //CTR block 4k+6 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + fmov v2.d[1], x9 //CTR block 4k+6 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + fmov d3, x10 //CTR block 4k+7 + mov d8, v4.d[1] //GHASH block 4k - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d10, v17.d[1] //GHASH block 4k - mid + rev w9, w12 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + fmov v3.d[1], x9 //CTR block 4k+7 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + eor x22, x22, x14 //AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + eor x21, x21, x13 //AES block 4k+2 - round 12 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - 
high + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + movi v8.8b, #0xc2 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + shl d8, d8, #56 //mod_constant + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + eor x23, x23, x13 //AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v0.16b, v29.16b //AES block 4k+4 - round 11 + add w12, w12, #1 //CTR block 4k+7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + + aese v1.16b, v29.16b //AES block 4k+5 - round 11 + ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext + rev w9, w12 //CTR block 4k+8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + cmp x0, x5 //.LOOP 
CONTROL + + eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result + eor x24, x24, x14 //AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + mov x19, v1.d[0] //AES block 4k+5 - mov low + + mov x6, v0.d[0] //AES block 4k+4 - mov low + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + rev64 v5.16b, v5.16b //GHASH block 4k+5 + + aese v2.16b, v29.16b //AES block 4k+6 - round 11 + mov x7, v0.d[1] //AES block 4k+4 - mov high + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + mov x20, v1.d[1] //AES block 4k+5 - mov high + + fmov d0, x10 //CTR block 4k+8 + add w12, w12, #1 //CTR block 4k+8 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + + eor x6, x6, x13 //AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + fmov d1, x10 //CTR block 4k+9 + add w12, w12, #1 //CTR block 4k+9 + eor x19, x19, x13 //AES block 4k+5 - round 12 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + eor x20, x20, x14 //AES block 4k+5 - round 12 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + eor x7, x7, x14 //AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + stp x6, x7, [x2], #16 //AES block 4k+4 - store result + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + + add w12, w12, #1 //CTR block 4k+10 + rev64 v4.16b, v4.16b //GHASH block 4k+4 + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + aese v3.16b, v29.16b //AES block 4k+7 - round 11 + stp x19, x20, [x2], #16 //AES block 4k+5 - store result + b.lt .L192_dec_main_loop + +.L192_dec_prepretail: //PREPRETAIL + mov x22, v2.d[1] //AES block 4k+2 - mov high + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + mov x21, v2.d[0] //AES block 4k+2 - mov low + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + eor v4.16b, v4.16b, v11.16b //PRE 1 + fmov d2, x10 //CTR block 4k+6 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + fmov v2.d[1], x9 //CTR block 4k+6 + rev w9, w12 //CTR block 4k+7 + + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + eor x24, x24, x14 //AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + 
fmov v3.d[1], x9 //CTR block 4k+7 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + eor x21, x21, x13 //AES block 4k+2 - round 12 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + eor x22, x22, x14 //AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor x23, x23, x13 //AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + rev64 v7.16b, v7.16b //GHASH block 4k+3 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + add w12, w12, #1 //CTR block 4k+7 + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high + + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + movi v8.8b, #0xc2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + shl d8, d8, #56 //mod_constant + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v0.16b, v26.16b + aesmc v0.16b, 
v0.16b //AES block 4k+4 - round 8 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + + aese v0.16b, v29.16b + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v2.16b, v29.16b + + aese v1.16b, v29.16b + + aese v3.16b, v29.16b + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low +.L192_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result + + mov x7, v0.d[1] //AES block 4k+4 - mov high + + mov x6, v0.d[0] //AES block 4k+4 - mov low + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + + cmp x5, #48 + + eor x7, x7, x14 //AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + b.gt .L192_dec_blocks_more_than_3 + + movi v11.8b, #0 + movi v9.8b, #0 + + mov v3.16b, v2.16b + mov v2.16b, v1.16b + sub w12, w12, #1 + + movi v10.8b, #0 + cmp x5, #32 + b.gt .L192_dec_blocks_more_than_2 + + mov v3.16b, v1.16b + cmp x5, #16 + sub w12, w12, #1 + + b.gt .L192_dec_blocks_more_than_1 + + sub w12, w12, #1 + b .L192_dec_blocks_less_than_1 +.L192_dec_blocks_more_than_3: //blocks left > 3 + rev64 v4.16b, v5.16b //GHASH final-3 block + ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + + stp x6, x7, [x2], #16 //AES final-3 block - store result + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor v0.16b, v5.16b, v1.16b //AES final-2 block - result + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + mov x6, v0.d[0] //AES final-2 block - mov low + mov d22, v4.d[1] //GHASH final-3 block - mid + + mov x7, v0.d[1] //AES final-2 block - mov high + + mov d10, v17.d[1] //GHASH final-3 block - mid + eor v22.8b, v22.8b, v4.8b //GHASH final-3 
block - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + eor x6, x6, x13 //AES final-2 block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + movi v8.8b, #0 //suppress further partial tag feed in + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor x7, x7, x14 //AES final-2 block - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L192_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v4.16b, v5.16b //GHASH final-2 block + ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + movi v8.8b, #0 //suppress further partial tag feed in + + eor v0.16b, v5.16b, v2.16b //AES final-1 block - result + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + stp x6, x7, [x2], #16 //AES final-2 block - store result + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + mov x7, v0.d[1] //AES final-1 block - mov high + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + mov x6, v0.d[0] //AES final-1 block - mov low + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + eor x7, x7, x14 //AES final-1 block - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES final-1 block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L192_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v4.16b, v5.16b //GHASH final-1 block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + + mov d22, v4.d[1] //GHASH final-1 block - mid + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + + eor v0.16b, v5.16b, v3.16b //AES final block - result + stp x6, x7, [x2], #16 //AES final-1 block - store result + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + mov x7, v0.d[1] //AES final block - mov high + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + mov x6, v0.d[0] //AES final block - mov low + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + eor x7, x7, x14 //AES final block - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES final block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid +.L192_dec_blocks_less_than_1: //blocks left <= 1 + + mvn x13, xzr //rk12_l = 0xffffffffffffffff + ldp x4, x5, [x2] //load existing bytes we need to not overwrite + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + mvn x14, xzr //rk12_h = 0xffffffffffffffff + + lsr x14, x14, x1 //rk12_h is mask for top 64b of last block + cmp x1, #64 + + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + + fmov d0, x9 //ctr0b is mask for last block + and x6, x6, x9 + bic x4, x4, x9 //mask out low existing bytes + + orr x6, x6, x4 + mov v0.d[1], x10 +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in 
highest bits + str w9, [x16, #12] //store the updated counter + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + bic x5, x5, x10 //mask out high existing bytes + + and x7, x7, x10 + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + mov d8, v4.d[1] //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + orr x7, x7, x5 + stp x6, x7, [x2] + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L192_dec_ret: + mov w0, #0x0 + ret +.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel +.globl aes_gcm_enc_256_kernel +.type aes_gcm_enc_256_kernel,%function +.align 4 +aes_gcm_enc_256_kernel: + cbz x1, .L256_enc_ret + stp x19, x20, [sp, #-112]! 
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add x4, x0, x1, lsr #3 //end_input_ptr + lsr x5, x1, #3 //byte_len + mov x15, x5 + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #224] //load rk14 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 //byte_len - 1 + + ld1 {v18.4s}, [x8], #16 //load rk0 + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + ld1 {v19.4s}, [x8], #16 //load rk1 + add x5, x5, x0 + + lsr x12, x11, #32 + fmov d2, x10 //CTR block 2 + orr w11, w11, w11 + + rev w12, w12 //rev_ctr32 + cmp x0, x5 //check if we have <= 4 blocks + fmov d1, x10 //CTR block 1 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + add w12, w12, #1 //increment rev_ctr32 + + rev w9, w12 //CTR block 1 + fmov d3, x10 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 1 + add w12, w12, #1 //CTR block 1 + ld1 {v20.4s}, [x8], #16 //load rk2 + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + orr x9, x11, x9, lsl #32 //CTR block 2 + ld1 {v21.4s}, [x8], #16 //load rk3 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + orr x9, x11, x9, lsl #32 //CTR block 3 + + fmov v3.d[1], x9 //CTR block 3 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + add w12, w12, #1 //CTR block 3 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES 
block 0 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + ld1 {v30.4s}, [x8], #16 //load rk12 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + ld1 {v31.4s}, [x8], #16 //load rk13 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + + aese v2.16b, v31.16b //AES block 2 - round 13 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + + aese v1.16b, v31.16b //AES block 1 - round 13 + + aese v0.16b, v31.16b //AES block 0 - round 13 + + aese v3.16b, v31.16b //AES block 3 - round 13 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + b.ge .L256_enc_tail //handle tail + + ldp x19, x20, [x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + rev w9, w12 //CTR block 4 + ldp x6, x7, [x0, #0] //AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + ldp x23, x24, [x0, #48] //AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + ldp x21, x22, [x0, #32] //AES 
block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + add x0, x0, #64 //AES input_ptr update + + eor x19, x19, x13 //AES block 1 - round 14 low + eor x20, x20, x14 //AES block 1 - round 14 high + + fmov d5, x19 //AES block 1 - mov low + eor x6, x6, x13 //AES block 0 - round 14 low + + eor x7, x7, x14 //AES block 0 - round 14 high + eor x24, x24, x14 //AES block 3 - round 14 high + fmov d4, x6 //AES block 0 - mov low + + cmp x0, x5 //check if we have <= 8 blocks + fmov v4.d[1], x7 //AES block 0 - mov high + eor x23, x23, x13 //AES block 3 - round 14 low + + eor x21, x21, x13 //AES block 2 - round 14 low + fmov v5.d[1], x20 //AES block 1 - mov high + + fmov d6, x21 //AES block 2 - mov low + add w12, w12, #1 //CTR block 4 + + orr x9, x11, x9, lsl #32 //CTR block 4 + fmov d7, x23 //AES block 3 - mov low + eor x22, x22, x14 //AES block 2 - round 14 high + + fmov v6.d[1], x22 //AES block 2 - mov high + + eor v4.16b, v4.16b, v0.16b //AES block 0 - result + fmov d0, x10 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + add w12, w12, #1 //CTR block 5 + + eor v5.16b, v5.16b, v1.16b //AES block 1 - result + fmov d1, x10 //CTR block 5 + orr x9, x11, x9, lsl #32 //CTR block 5 + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + st1 { v4.16b}, [x2], #16 //AES block 0 - store result + + fmov v7.d[1], x24 //AES block 3 - mov high + orr x9, x11, x9, lsl #32 //CTR block 6 + eor v6.16b, v6.16b, v2.16b //AES block 2 - result + + st1 { v5.16b}, [x2], #16 //AES block 1 - store result + + add w12, w12, #1 //CTR block 6 + fmov d2, x10 //CTR block 6 + + fmov v2.d[1], x9 //CTR block 6 + st1 { v6.16b}, [x2], #16 //AES block 2 - store result + rev w9, w12 //CTR block 7 + + orr x9, x11, x9, lsl #32 //CTR block 7 + + eor v7.16b, v7.16b, v3.16b //AES block 3 - result + st1 { v7.16b}, [x2], #16 //AES block 3 - store result + b.ge .L256_enc_prepretail //do prepretail + +.L256_enc_main_loop: //main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d3, x10 //CTR block 4k+3 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x23, x23, x13 //AES block 4k+7 - round 14 low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + mov d10, v17.d[1] //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + eor x22, x22, x14 //AES block 4k+6 - round 14 high + mov d8, v4.d[1] //GHASH block 4k - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 
+ + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + mov d4, v7.d[1] //GHASH block 4k+3 - mid + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low + + pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor x19, x19, x13 //AES block 4k+5 - round 14 low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + eor x21, x21, x13 //AES block 4k+6 - round 14 low + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + movi v8.8b, #0xc2 + + pmull v4.1q, v4.1d, 
v16.1d //GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + fmov d5, x19 //AES block 4k+5 - mov low + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + shl d8, d8, #56 //mod_constant + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + add w12, w12, #1 //CTR block 4k+3 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + add x0, x0, #64 //AES input_ptr update + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + rev w9, w12 //CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + eor x6, x6, x13 //AES block 4k+4 - round 14 low + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + eor x7, x7, x14 //AES block 4k+4 - round 14 high + + fmov d4, x6 //AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b //MODULO - fold into mid + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 + eor x20, x20, x14 //AES block 4k+5 - round 14 high + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + eor x24, x24, x14 //AES block 4k+7 - round 14 high + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + add w12, w12, #1 //CTR block 4k+8 + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + fmov v4.d[1], x7 //AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + fmov d7, x23 //AES block 4k+7 - mov low + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + fmov v5.d[1], x20 //AES block 4k+5 - mov high + + fmov d6, x21 //AES block 4k+6 - mov low + cmp x0, x5 //.LOOP CONTROL + + fmov v6.d[1], x22 //AES block 4k+6 - mov high + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result + fmov d0, x10 //CTR block 4k+8 + + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + add w12, w12, #1 //CTR block 4k+9 + + eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result + fmov d1, x10 //CTR block 4k+9 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + fmov v1.d[1], x9 //CTR block 4k+9 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + rev w9, w12 //CTR block 4k+10 + st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + fmov v7.d[1], x24 //AES block 4k+7 - mov high + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + st1 
{ v5.16b}, [x2], #16 //AES block 4k+5 - store result + add w12, w12, #1 //CTR block 4k+10 + + aese v3.16b, v31.16b //AES block 4k+7 - round 13 + eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result + fmov d2, x10 //CTR block 4k+10 + + st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + fmov v2.d[1], x9 //CTR block 4k+10 + rev w9, w12 //CTR block 4k+11 + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + orr x9, x11, x9, lsl #32 //CTR block 4k+11 + + eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result + b.lt .L256_enc_main_loop + +.L256_enc_prepretail: //PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov d3, x10 //CTR block 4k+3 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + fmov v3.d[1], x9 //CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + + eor v4.16b, v4.16b, v11.16b //PRE 1 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + add w12, w12, #1 //CTR block 4k+3 + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + eor v11.16b, v11.16b, 
v5.16b //GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + mov d4, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + shl d8, d8, #56 //mod_constant + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + eor v10.16b, v10.16b, v9.16b //karatsuba tidy up + + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v11.16b + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + eor v10.16b, v10.16b, v4.16b + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + eor v10.16b, v10.16b, v9.16b + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 + + pmull v4.1q, v10.1d, v8.1d + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + ext v10.16b, v10.16b, v10.16b, #8 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + eor v11.16b, v11.16b, v4.16b + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + + aese v3.16b, v31.16b //AES block 4k+7 - 
round 13 + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + eor v11.16b, v11.16b, v10.16b +.L256_enc_tail: //TAIL + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 4k+4 - round 14 low + eor x7, x7, x14 //AES block 4k+4 - round 14 high + + cmp x5, #48 + fmov d4, x6 //AES block 4k+4 - mov low + + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result + b.gt .L256_enc_blocks_more_than_3 + + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + + movi v9.8b, #0 + sub w12, w12, #1 + + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt .L256_enc_blocks_more_than_2 + + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + + b.gt .L256_enc_blocks_more_than_1 + + sub w12, w12, #1 + b .L256_enc_blocks_less_than_1 +.L256_enc_blocks_more_than_3: //blocks left > 3 + st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + + ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-3 block + + eor x6, x6, x13 //AES final-2 block - round 14 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final-2 block - round 14 high + + mov d22, v4.d[1] //GHASH final-3 block - mid + fmov d5, x6 //AES final-2 block - mov low + + fmov v5.d[1], x7 //AES final-2 block - mov high + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + movi v8.8b, #0 //suppress further partial tag feed in + + mov d10, v17.d[1] //GHASH final-3 block - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b //AES final-2 block - result +.L256_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + + ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-2 block + + eor x6, x6, x13 //AES final-1 block - round 14 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + fmov d5, x6 //AES final-1 block - mov low + eor x7, x7, x14 //AES final-1 block - round 14 high + + fmov v5.d[1], x7 //AES final-1 block - mov high + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + eor v5.16b, v5.16b, v2.16b //AES final-1 block - result + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L256_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + + rev64 v4.16b, v5.16b //GHASH final-1 block + + ldp x6, x7, [x0], #16 //AES final block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + movi v8.8b, #0 //suppress further partial tag 
feed in + + eor x6, x6, x13 //AES final block - round 14 low + mov d22, v4.d[1] //GHASH final-1 block - mid + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + eor x7, x7, x14 //AES final block - round 14 high + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + fmov d5, x6 //AES final block - mov low + + fmov v5.d[1], x7 //AES final block - mov high + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + eor v5.16b, v5.16b, v3.16b //AES final block - result + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low +.L256_enc_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + + mvn x13, xzr //rk14_l = 0xffffffffffffffff + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + mvn x14, xzr //rk14_h = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk14_h is mask for top 64b of last block + cmp x1, #64 + + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + + fmov d0, x6 //ctr0b is mask for last block + + fmov v0.d[1], x7 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + mov d8, v4.d[1] //GHASH final block - mid +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + str w9, [x16, #12] //store the updated counter + + st1 { v5.16b}, [x2] //store all 16B + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L256_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel +.globl aes_gcm_dec_256_kernel +.type aes_gcm_dec_256_kernel,%function +.align 4 +aes_gcm_dec_256_kernel: + cbz x1, .L256_dec_ret + stp x19, x20, [sp, #-112]! 
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr x5, x1, #3 //byte_len + mov x15, x5 + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #224] //load rk14 +#ifdef __AARCH64EB__ + ror x14, x14, #32 + ror x13, x13, #32 +#endif + ld1 {v18.4s}, [x8], #16 //load rk0 + sub x5, x5, #1 //byte_len - 1 + + ld1 {v19.4s}, [x8], #16 //load rk1 + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + add x4, x0, x1, lsr #3 //end_input_ptr + ld1 {v20.4s}, [x8], #16 //load rk2 + + lsr x12, x11, #32 + ld1 {v21.4s}, [x8], #16 //load rk3 + orr w11, w11, w11 + + ld1 {v22.4s}, [x8], #16 //load rk4 + add x5, x5, x0 + rev w12, w12 //rev_ctr32 + + add w12, w12, #1 //increment rev_ctr32 + fmov d3, x10 //CTR block 3 + + rev w9, w12 //CTR block 1 + add w12, w12, #1 //CTR block 1 + fmov d1, x10 //CTR block 1 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 3 + ld1 {v23.4s}, [x8], #16 //load rk5 + + fmov v3.d[1], x9 //CTR block 3 + add w12, w12, #1 //CTR block 3 + + ld1 {v24.4s}, [x8], #16 //load rk6 + + ld1 {v25.4s}, [x8], #16 //load rk7 + + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ld1 {v30.4s}, [x8], #16 //load rk12 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + cmp x0, x5 //check if we have <= 4 blocks + + aese v2.16b, v21.16b + aesmc v2.16b, 
v2.16b //AES block 2 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + ld1 {v31.4s}, [x8], #16 //load rk13 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v1.16b, v31.16b //AES block 1 - round 13 + + aese v2.16b, v31.16b //AES block 2 - round 13 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v3.16b, v31.16b //AES block 3 - round 13 + + aese v0.16b, v31.16b //AES block 0 - round 13 + b.ge .L256_dec_tail //handle tail + + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext + + rev w9, w12 //CTR block 4 + + eor v0.16b, v4.16b, v0.16b //AES block 0 - result + + eor v1.16b, v5.16b, v1.16b //AES block 1 - result + rev64 v5.16b, v5.16b //GHASH block 1 + ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext + + mov x7, v0.d[1] //AES block 0 - mov high + + mov x6, v0.d[0] //AES block 0 - 
mov low + rev64 v4.16b, v4.16b //GHASH block 0 + add w12, w12, #1 //CTR block 4 + + fmov d0, x10 //CTR block 4 + orr x9, x11, x9, lsl #32 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + add w12, w12, #1 //CTR block 5 + + mov x19, v1.d[0] //AES block 1 - mov low + + orr x9, x11, x9, lsl #32 //CTR block 5 + mov x20, v1.d[1] //AES block 1 - mov high + eor x7, x7, x14 //AES block 0 - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 0 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + stp x6, x7, [x2], #16 //AES block 0 - store result + fmov d1, x10 //CTR block 5 + + ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + add w12, w12, #1 //CTR block 6 + + eor x19, x19, x13 //AES block 1 - round 14 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + orr x9, x11, x9, lsl #32 //CTR block 6 + + eor x20, x20, x14 //AES block 1 - round 14 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + stp x19, x20, [x2], #16 //AES block 1 - store result + + eor v2.16b, v6.16b, v2.16b //AES block 2 - result + cmp x0, x5 //check if we have <= 8 blocks + b.ge .L256_dec_prepretail //do prepretail + +.L256_dec_main_loop: //main loop start + mov x21, v2.d[0] //AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov x22, v2.d[1] //AES block 4k+2 - mov high + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d2, x10 //CTR block 4k+6 + + fmov v2.d[1], x9 //CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b //PRE 1 + rev w9, w12 //CTR block 4k+7 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov v3.d[1], x9 //CTR block 4k+7 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor x22, x22, x14 //AES block 4k+2 - round 14 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x21, x21, x13 //AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor x23, x23, x13 //AES block 4k+3 - round 14 low +#ifdef __AARCH64EB__ + rev 
x23, x23 +#endif + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + eor x24, x24, x14 //AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + add w12, w12, #1 //CTR block 4k+7 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + rev w9, w12 //CTR block 4k+8 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + add w12, w12, #1 //CTR block 4k+8 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + mov d6, v7.d[1] //GHASH block 4k+3 - mid + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + shl d8, d8, #56 //mod_constant + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 + + 
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + mov x7, v0.d[1] //AES block 4k+4 - mov high + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + mov x6, v0.d[0] //AES block 4k+4 - mov low + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + fmov d0, x10 //CTR block 4k+8 + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + fmov v0.d[1], x9 //CTR block 4k+8 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result + rev w9, w12 //CTR block 4k+9 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + cmp x0, x5 //.LOOP CONTROL + + add w12, w12, #1 //CTR block 4k+9 + + eor x6, x6, x13 //AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor x7, x7, x14 //AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + mov x20, v1.d[1] //AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + mov x19, v1.d[0] //AES block 4k+5 - mov low + + fmov d1, x10 //CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + add w12, w12, #1 //CTR block 4k+10 + + aese v3.16b, v31.16b //AES block 4k+7 - round 13 + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + rev64 v5.16b, v5.16b //GHASH block 4k+5 + eor x20, x20, x14 //AES block 4k+5 - round 14 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + stp x6, x7, [x2], #16 //AES block 4k+4 - store result + + eor x19, x19, x13 //AES block 4k+5 - round 14 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + stp x19, x20, [x2], #16 //AES block 4k+5 - store result + + rev64 v4.16b, v4.16b //GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + b.lt .L256_dec_main_loop + + +.L256_dec_prepretail: //PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + mov x21, 
v2.d[0] //AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov x22, v2.d[1] //AES block 4k+2 - mov high + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d2, x10 //CTR block 4k+6 + + fmov v2.d[1], x9 //CTR block 4k+6 + rev w9, w12 //CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + rev64 v6.16b, v6.16b //GHASH block 4k+2 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + fmov v3.d[1], x9 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + + pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + mov d6, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + aese v2.16b, v23.16b + 
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low + + pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + shl d8, d8, #56 //mod_constant + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + eor x22, x22, x14 //AES block 4k+2 - round 14 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor x23, x23, x13 //AES block 4k+3 - round 14 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + add w12, w12, #1 //CTR block 4k+7 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + eor x21, x21, x13 //AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor x24, x24, x14 //AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 + stp x23, x24, [x2], #16 //AES block 
4k+3 - store result + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + + aese v3.16b, v31.16b //AES block 4k+7 - round 13 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low +.L256_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result + + mov x6, v0.d[0] //AES block 4k+4 - mov low + + mov x7, v0.d[1] //AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + + cmp x5, #48 + + eor x6, x6, x13 //AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + + eor x7, x7, x14 //AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + b.gt .L256_dec_blocks_more_than_3 + + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + + movi v11.8b, #0 + cmp x5, #32 + + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt .L256_dec_blocks_more_than_2 + + sub w12, w12, #1 + + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .L256_dec_blocks_more_than_1 + + sub w12, w12, #1 + b .L256_dec_blocks_less_than_1 +.L256_dec_blocks_more_than_3: //blocks left > 3 + rev64 v4.16b, v5.16b //GHASH final-3 block + ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + + stp x6, x7, [x2], #16 //AES final-3 block - store result + + mov d10, v17.d[1] //GHASH final-3 block - mid + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor v0.16b, v5.16b, v1.16b //AES final-2 block - result + + mov d22, v4.d[1] //GHASH final-3 block - mid + + mov x6, v0.d[0] //AES final-2 block - mov low + + mov x7, v0.d[1] //AES final-2 block - mov high + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor x6, x6, x13 //AES final-2 block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + eor x7, x7, x14 //AES final-2 block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L256_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v4.16b, v5.16b //GHASH final-2 block + ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + stp x6, x7, [x2], #16 //AES final-2 block - store result + + eor v0.16b, v5.16b, v2.16b //AES final-1 block - result + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + mov x6, v0.d[0] //AES final-1 block - mov low + + mov x7, v0.d[1] //AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + movi v8.8b, #0 //suppress further partial tag feed in + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + eor x6, x6, x13 //AES final-1 block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid + eor x7, x7, x14 //AES final-1 block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif 
+.L256_dec_blocks_more_than_1: //blocks left > 1 + + stp x6, x7, [x2], #16 //AES final-1 block - store result + rev64 v4.16b, v5.16b //GHASH final-1 block + + ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + movi v8.8b, #0 //suppress further partial tag feed in + + mov d22, v4.d[1] //GHASH final-1 block - mid + + eor v0.16b, v5.16b, v3.16b //AES final block - result + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + mov x6, v0.d[0] //AES final block - mov low + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + mov x7, v0.d[1] //AES final block - mov high + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + eor x6, x6, x13 //AES final block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid + eor x7, x7, x14 //AES final block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L256_dec_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + mvn x14, xzr //rk14_h = 0xffffffffffffffff + + sub x1, x1, #128 //bit_length -= 128 + mvn x13, xzr //rk14_l = 0xffffffffffffffff + + ldp x4, x5, [x2] //load existing bytes we need to not overwrite + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk14_h is mask for top 64b of last block + cmp x1, #64 + + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + + fmov d0, x9 //ctr0b is mask for last block + and x6, x6, x9 + + mov v0.d[1], x10 + bic x4, x4, x9 //mask out low existing bytes + +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + bic x5, x5, x10 //mask out high existing bytes + + orr x6, x6, x4 + + and x7, x7, x10 + + orr x7, x7, x5 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + mov d8, v4.d[1] //GHASH final block - mid + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + stp x6, x7, [x2] + + str w9, [x16, #12] //store the updated counter + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + 
ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L256_dec_ret: + mov w0, #0x0 + ret +.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif diff --git a/sys/crypto/openssl/aarch64/aesv8-armx.S b/sys/crypto/openssl/aarch64/aesv8-armx.S --- a/sys/crypto/openssl/aarch64/aesv8-armx.S +++ b/sys/crypto/openssl/aarch64/aesv8-armx.S @@ -2,6 +2,7 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 +.arch armv8-a+crypto .text .align 5 .Lrcon: @@ -280,6 +281,752 @@ st1 {v2.16b},[x1] ret .size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_ecb_encrypt +.type aes_v8_ecb_encrypt,%function +.align 5 +aes_v8_ecb_encrypt: + subs x2,x2,#16 + // Original input data size bigger than 16, jump to big size processing. + b.ne .Lecb_big_size + ld1 {v0.16b},[x0] + cmp w4,#0 // en- or decrypting? + ldr w5,[x3,#240] + ld1 {v5.4s,v6.4s},[x3],#32 // load key schedule... + + b.eq .Lecb_small_dec + aese v0.16b,v5.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + aese v0.16b,v6.16b + aesmc v0.16b,v0.16b + subs w5,w5,#10 // if rounds==10, jump to aes-128-ecb processing + b.eq .Lecb_128_enc +.Lecb_round_loop: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x3],#16 // load key schedule... + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3],#16 // load key schedule... + subs w5,w5,#2 // bias + b.gt .Lecb_round_loop +.Lecb_128_enc: + ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v7.4s},[x3] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v0.16b,v0.16b,v7.16b + st1 {v0.16b},[x1] + b .Lecb_Final_abort +.Lecb_small_dec: + aesd v0.16b,v5.16b + aesimc v0.16b,v0.16b + ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + aesd v0.16b,v6.16b + aesimc v0.16b,v0.16b + subs w5,w5,#10 // bias + b.eq .Lecb_128_dec +.Lecb_dec_round_loop: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + ld1 {v16.4s},[x3],#16 // load key schedule... + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + ld1 {v17.4s},[x3],#16 // load key schedule... + subs w5,w5,#2 // bias + b.gt .Lecb_dec_round_loop +.Lecb_128_dec: + ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + aesd v0.16b,v18.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v19.16b + aesimc v0.16b,v0.16b + ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + ld1 {v7.4s},[x3] + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v23.16b + eor v0.16b,v0.16b,v7.16b + st1 {v0.16b},[x1] + b .Lecb_Final_abort +.Lecb_big_size: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x8,#16 + b.lo .Lecb_done + csel x8,xzr,x8,eq + + cmp w4,#0 // en- or decrypting? 
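+ // Editor's note (illustrative, not part of the generated output): x3 points at
+ // an expanded AES key whose round count is stored at byte offset 240, matching
+ // OpenSSL's AES_KEY layout; a hypothetical C view of that assumption:
+ //
+ //   typedef struct { uint32_t rd_key[60]; unsigned int rounds; } AES_KEY;
+ //
+ // The code below loads that count and computes x7 = key + 16*(rounds-6), i.e.
+ // a pointer to the last seven round keys, so v18-v23 and v7 can stay resident
+ // while v16/v17 stream the middle rounds each iteration.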
+ ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq .Lecb_dec + + ld1 {v1.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v1.16b,v1.16b + orr v24.16b,v1.16b,v1.16b + orr v1.16b,v0.16b,v0.16b + b.lo .Lecb_enc_tail + + orr v1.16b,v3.16b,v3.16b + ld1 {v24.16b},[x0],#16 + cmp x2,#32 + b.lo .Loop3x_ecb_enc + + ld1 {v25.16b},[x0],#16 + ld1 {v26.16b},[x0],#16 + sub x2,x2,#32 // bias + mov w6,w5 + +.Loop5x_ecb_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v25.16b,v16.16b + aesmc v25.16b,v25.16b + aese v26.16b,v16.16b + aesmc v26.16b,v26.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v25.16b,v17.16b + aesmc v25.16b,v25.16b + aese v26.16b,v17.16b + aesmc v26.16b,v26.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop5x_ecb_enc + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v25.16b,v16.16b + aesmc v25.16b,v25.16b + aese v26.16b,v16.16b + aesmc v26.16b,v26.16b + cmp x2,#0x40 // because .Lecb_enc_tail4x + sub x2,x2,#0x50 + + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v25.16b,v17.16b + aesmc v25.16b,v25.16b + aese v26.16b,v17.16b + aesmc v26.16b,v26.16b + csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo + mov x7,x3 + + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v1.16b,v18.16b + aesmc v1.16b,v1.16b + aese v24.16b,v18.16b + aesmc v24.16b,v24.16b + aese v25.16b,v18.16b + aesmc v25.16b,v25.16b + aese v26.16b,v18.16b + aesmc v26.16b,v26.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v26.16b + // are loaded with last "words" + add x6,x2,#0x60 // because .Lecb_enc_tail4x + + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + aese v1.16b,v19.16b + aesmc v1.16b,v1.16b + aese v24.16b,v19.16b + aesmc v24.16b,v24.16b + aese v25.16b,v19.16b + aesmc v25.16b,v25.16b + aese v26.16b,v19.16b + aesmc v26.16b,v26.16b + + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + aese v24.16b,v20.16b + aesmc v24.16b,v24.16b + aese v25.16b,v20.16b + aesmc v25.16b,v25.16b + aese v26.16b,v20.16b + aesmc v26.16b,v26.16b + + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + aese v24.16b,v21.16b + aesmc v24.16b,v24.16b + aese v25.16b,v21.16b + aesmc v25.16b,v25.16b + aese v26.16b,v21.16b + aesmc v26.16b,v26.16b + + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + aese v24.16b,v22.16b + aesmc v24.16b,v24.16b + aese v25.16b,v22.16b + aesmc v25.16b,v25.16b + aese v26.16b,v22.16b + aesmc v26.16b,v26.16b + + aese v0.16b,v23.16b + ld1 {v2.16b},[x0],#16 + aese v1.16b,v23.16b + ld1 {v3.16b},[x0],#16 + aese v24.16b,v23.16b + ld1 {v27.16b},[x0],#16 + aese v25.16b,v23.16b + ld1 {v28.16b},[x0],#16 + aese v26.16b,v23.16b + ld1 {v29.16b},[x0],#16 + cbz x6,.Lecb_enc_tail4x + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + eor v4.16b,v7.16b,v0.16b + orr v0.16b,v2.16b,v2.16b + 
eor v5.16b,v7.16b,v1.16b + orr v1.16b,v3.16b,v3.16b + eor v17.16b,v7.16b,v24.16b + orr v24.16b,v27.16b,v27.16b + eor v30.16b,v7.16b,v25.16b + orr v25.16b,v28.16b,v28.16b + eor v31.16b,v7.16b,v26.16b + st1 {v4.16b},[x1],#16 + orr v26.16b,v29.16b,v29.16b + st1 {v5.16b},[x1],#16 + mov w6,w5 + st1 {v17.16b},[x1],#16 + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + b.hs .Loop5x_ecb_enc + + add x2,x2,#0x50 + cbz x2,.Lecb_done + + add w6,w5,#2 + subs x2,x2,#0x30 + orr v0.16b,v27.16b,v27.16b + orr v1.16b,v28.16b,v28.16b + orr v24.16b,v29.16b,v29.16b + b.lo .Lecb_enc_tail + + b .Loop3x_ecb_enc + +.align 4 +.Lecb_enc_tail4x: + eor v5.16b,v7.16b,v1.16b + eor v17.16b,v7.16b,v24.16b + eor v30.16b,v7.16b,v25.16b + eor v31.16b,v7.16b,v26.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + + b .Lecb_done +.align 4 +.Loop3x_ecb_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ecb_enc + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + subs x2,x2,#0x30 + csel x6,x2,x6,lo // x6, w6, is zero at this point + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v24.16b + // are loaded with last "words" + mov x7,x3 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + aese v24.16b,v20.16b + aesmc v24.16b,v24.16b + ld1 {v2.16b},[x0],#16 + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + aese v24.16b,v21.16b + aesmc v24.16b,v24.16b + ld1 {v3.16b},[x0],#16 + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + aese v24.16b,v22.16b + aesmc v24.16b,v24.16b + ld1 {v27.16b},[x0],#16 + aese v0.16b,v23.16b + aese v1.16b,v23.16b + aese v24.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v7.16b,v0.16b + eor v5.16b,v7.16b,v1.16b + eor v24.16b,v24.16b,v7.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v24.16b},[x1],#16 + orr v24.16b,v27.16b,v27.16b + b.hs .Loop3x_ecb_enc + + cmn x2,#0x30 + b.eq .Lecb_done + nop + +.Lecb_enc_tail: + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lecb_enc_tail + + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + aese v24.16b,v20.16b + aesmc v24.16b,v24.16b + cmn x2,#0x20 + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + aese v24.16b,v21.16b + aesmc v24.16b,v24.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + aese v24.16b,v22.16b + aesmc v24.16b,v24.16b + aese v1.16b,v23.16b + aese v24.16b,v23.16b + b.eq 
.Lecb_enc_one + eor v5.16b,v7.16b,v1.16b + eor v17.16b,v7.16b,v24.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b .Lecb_done + +.Lecb_enc_one: + eor v5.16b,v7.16b,v24.16b + st1 {v5.16b},[x1],#16 + b .Lecb_done +.align 5 +.Lecb_dec: + ld1 {v1.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v1.16b,v1.16b + orr v24.16b,v1.16b,v1.16b + orr v1.16b,v0.16b,v0.16b + b.lo .Lecb_dec_tail + + orr v1.16b,v3.16b,v3.16b + ld1 {v24.16b},[x0],#16 + cmp x2,#32 + b.lo .Loop3x_ecb_dec + + ld1 {v25.16b},[x0],#16 + ld1 {v26.16b},[x0],#16 + sub x2,x2,#32 // bias + mov w6,w5 + +.Loop5x_ecb_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v16.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v16.16b + aesimc v26.16b,v26.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v17.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v17.16b + aesimc v26.16b,v26.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop5x_ecb_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v16.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v16.16b + aesimc v26.16b,v26.16b + cmp x2,#0x40 // because .Lecb_tail4x + sub x2,x2,#0x50 + + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v17.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v17.16b + aesimc v26.16b,v26.16b + csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo + mov x7,x3 + + aesd v0.16b,v18.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v18.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v18.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v18.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v18.16b + aesimc v26.16b,v26.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v26.16b + // are loaded with last "words" + add x6,x2,#0x60 // because .Lecb_tail4x + + aesd v0.16b,v19.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v19.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v19.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v19.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v19.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v20.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v20.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v21.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v21.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v22.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v22.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v23.16b + ld1 {v2.16b},[x0],#16 + aesd v1.16b,v23.16b + ld1 {v3.16b},[x0],#16 + aesd v24.16b,v23.16b + ld1 {v27.16b},[x0],#16 + aesd v25.16b,v23.16b + ld1 {v28.16b},[x0],#16 + aesd v26.16b,v23.16b + ld1 {v29.16b},[x0],#16 + cbz x6,.Lecb_tail4x + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + eor v4.16b,v7.16b,v0.16b + orr v0.16b,v2.16b,v2.16b + eor v5.16b,v7.16b,v1.16b + orr v1.16b,v3.16b,v3.16b + eor v17.16b,v7.16b,v24.16b + 
orr v24.16b,v27.16b,v27.16b + eor v30.16b,v7.16b,v25.16b + orr v25.16b,v28.16b,v28.16b + eor v31.16b,v7.16b,v26.16b + st1 {v4.16b},[x1],#16 + orr v26.16b,v29.16b,v29.16b + st1 {v5.16b},[x1],#16 + mov w6,w5 + st1 {v17.16b},[x1],#16 + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + b.hs .Loop5x_ecb_dec + + add x2,x2,#0x50 + cbz x2,.Lecb_done + + add w6,w5,#2 + subs x2,x2,#0x30 + orr v0.16b,v27.16b,v27.16b + orr v1.16b,v28.16b,v28.16b + orr v24.16b,v29.16b,v29.16b + b.lo .Lecb_dec_tail + + b .Loop3x_ecb_dec + +.align 4 +.Lecb_tail4x: + eor v5.16b,v7.16b,v1.16b + eor v17.16b,v7.16b,v24.16b + eor v30.16b,v7.16b,v25.16b + eor v31.16b,v7.16b,v26.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + + b .Lecb_done +.align 4 +.Loop3x_ecb_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ecb_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + subs x2,x2,#0x30 + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v24.16b + // are loaded with last "words" + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + ld1 {v27.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v24.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v7.16b,v0.16b + eor v5.16b,v7.16b,v1.16b + eor v24.16b,v24.16b,v7.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v24.16b},[x1],#16 + orr v24.16b,v27.16b,v27.16b + b.hs .Loop3x_ecb_dec + + cmn x2,#0x30 + b.eq .Lecb_done + nop + +.Lecb_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lecb_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + aesd v1.16b,v23.16b + aesd v24.16b,v23.16b + b.eq .Lecb_dec_one + eor v5.16b,v7.16b,v1.16b + eor 
v17.16b,v7.16b,v24.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b .Lecb_done + +.Lecb_dec_one: + eor v5.16b,v7.16b,v24.16b + st1 {v5.16b},[x1],#16 + +.Lecb_done: + ldr x29,[sp],#16 +.Lecb_Final_abort: + ret +.size aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt .globl aes_v8_cbc_encrypt .type aes_v8_cbc_encrypt,%function .align 5 @@ -416,35 +1163,220 @@ b .Lcbc_done .align 5 .Lcbc_dec: - ld1 {v18.16b},[x0],#16 + ld1 {v24.16b},[x0],#16 subs x2,x2,#32 // bias add w6,w5,#2 orr v3.16b,v0.16b,v0.16b orr v1.16b,v0.16b,v0.16b - orr v19.16b,v18.16b,v18.16b + orr v27.16b,v24.16b,v24.16b b.lo .Lcbc_dec_tail - orr v1.16b,v18.16b,v18.16b - ld1 {v18.16b},[x0],#16 + orr v1.16b,v24.16b,v24.16b + ld1 {v24.16b},[x0],#16 orr v2.16b,v0.16b,v0.16b orr v3.16b,v1.16b,v1.16b - orr v19.16b,v18.16b,v18.16b + orr v27.16b,v24.16b,v24.16b + cmp x2,#32 + b.lo .Loop3x_cbc_dec + ld1 {v25.16b},[x0],#16 + ld1 {v26.16b},[x0],#16 + sub x2,x2,#32 // bias + mov w6,w5 + orr v28.16b,v25.16b,v25.16b + orr v29.16b,v26.16b,v26.16b + +.Loop5x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v16.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v16.16b + aesimc v26.16b,v26.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v17.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v17.16b + aesimc v26.16b,v26.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop5x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v16.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v16.16b + aesimc v26.16b,v26.16b + cmp x2,#0x40 // because .Lcbc_tail4x + sub x2,x2,#0x50 + + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v17.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v17.16b + aesimc v26.16b,v26.16b + csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo + mov x7,x3 + + aesd v0.16b,v18.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v18.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v18.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v18.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v18.16b + aesimc v26.16b,v26.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v26.16b + // are loaded with last "words" + add x6,x2,#0x60 // because .Lcbc_tail4x + + aesd v0.16b,v19.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v19.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v19.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v19.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v19.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v20.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v20.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v21.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v21.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v22.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v22.16b + aesimc v26.16b,v26.16b + + eor v4.16b,v6.16b,v7.16b 
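+ // Editor's note (illustrative, not part of the generated output): CBC
+ // decryption of each block is independent, which is what permits the 5x
+ // interleave above.  The eor just computed (and the ones below) fold the final
+ // AddRoundKey together with the CBC chaining XOR:
+ //
+ //   P[i] = Dec_k(C[i]) ^ C[i-1]
+ //        = aesd_rounds(C[i]) ^ rk_last ^ C[i-1]
+ //        = aesd_rounds(C[i]) ^ (rk_last ^ C[i-1])   /* held in v4,v5,v17,v30,v31 */
+ //
+ // where v7 is the last round key and v6/v2/v3/v27/v28 are the previous
+ // ciphertext blocks (the per-lane IVs).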
+ aesd v0.16b,v23.16b + eor v5.16b,v2.16b,v7.16b + ld1 {v2.16b},[x0],#16 + aesd v1.16b,v23.16b + eor v17.16b,v3.16b,v7.16b + ld1 {v3.16b},[x0],#16 + aesd v24.16b,v23.16b + eor v30.16b,v27.16b,v7.16b + ld1 {v27.16b},[x0],#16 + aesd v25.16b,v23.16b + eor v31.16b,v28.16b,v7.16b + ld1 {v28.16b},[x0],#16 + aesd v26.16b,v23.16b + orr v6.16b,v29.16b,v29.16b + ld1 {v29.16b},[x0],#16 + cbz x6,.Lcbc_tail4x + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + eor v4.16b,v4.16b,v0.16b + orr v0.16b,v2.16b,v2.16b + eor v5.16b,v5.16b,v1.16b + orr v1.16b,v3.16b,v3.16b + eor v17.16b,v17.16b,v24.16b + orr v24.16b,v27.16b,v27.16b + eor v30.16b,v30.16b,v25.16b + orr v25.16b,v28.16b,v28.16b + eor v31.16b,v31.16b,v26.16b + st1 {v4.16b},[x1],#16 + orr v26.16b,v29.16b,v29.16b + st1 {v5.16b},[x1],#16 + mov w6,w5 + st1 {v17.16b},[x1],#16 + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + b.hs .Loop5x_cbc_dec + + add x2,x2,#0x50 + cbz x2,.Lcbc_done + + add w6,w5,#2 + subs x2,x2,#0x30 + orr v0.16b,v27.16b,v27.16b + orr v2.16b,v27.16b,v27.16b + orr v1.16b,v28.16b,v28.16b + orr v3.16b,v28.16b,v28.16b + orr v24.16b,v29.16b,v29.16b + orr v27.16b,v29.16b,v29.16b + b.lo .Lcbc_dec_tail + + b .Loop3x_cbc_dec + +.align 4 +.Lcbc_tail4x: + eor v5.16b,v4.16b,v1.16b + eor v17.16b,v17.16b,v24.16b + eor v30.16b,v30.16b,v25.16b + eor v31.16b,v31.16b,v26.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + + b .Lcbc_done +.align 4 .Loop3x_cbc_dec: aesd v0.16b,v16.16b aesimc v0.16b,v0.16b aesd v1.16b,v16.16b aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v0.16b,v17.16b aesimc v0.16b,v0.16b aesd v1.16b,v17.16b aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b ld1 {v17.4s},[x7],#16 b.gt .Loop3x_cbc_dec @@ -452,8 +1384,8 @@ aesimc v0.16b,v0.16b aesd v1.16b,v16.16b aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b eor v4.16b,v6.16b,v7.16b subs x2,x2,#0x30 eor v5.16b,v2.16b,v7.16b @@ -462,50 +1394,50 @@ aesimc v0.16b,v0.16b aesd v1.16b,v17.16b aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b eor v17.16b,v3.16b,v7.16b add x0,x0,x6 // x0 is adjusted in such way that - // at exit from the loop v1.16b-v18.16b + // at exit from the loop v1.16b-v24.16b // are loaded with last "words" - orr v6.16b,v19.16b,v19.16b + orr v6.16b,v27.16b,v27.16b mov x7,x3 aesd v0.16b,v20.16b aesimc v0.16b,v0.16b aesd v1.16b,v20.16b aesimc v1.16b,v1.16b - aesd v18.16b,v20.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b ld1 {v2.16b},[x0],#16 aesd v0.16b,v21.16b aesimc v0.16b,v0.16b aesd v1.16b,v21.16b aesimc v1.16b,v1.16b - aesd v18.16b,v21.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b ld1 {v3.16b},[x0],#16 aesd v0.16b,v22.16b aesimc v0.16b,v0.16b aesd v1.16b,v22.16b aesimc v1.16b,v1.16b - aesd v18.16b,v22.16b - aesimc v18.16b,v18.16b - ld1 {v19.16b},[x0],#16 + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + ld1 {v27.16b},[x0],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b - aesd v18.16b,v23.16b + aesd v24.16b,v23.16b ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] add w6,w5,#2 eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b - eor v18.16b,v18.16b,v17.16b + eor v24.16b,v24.16b,v17.16b ld1 {v17.4s},[x7],#16 
// re-pre-load rndkey[1] st1 {v4.16b},[x1],#16 orr v0.16b,v2.16b,v2.16b st1 {v5.16b},[x1],#16 orr v1.16b,v3.16b,v3.16b - st1 {v18.16b},[x1],#16 - orr v18.16b,v19.16b,v19.16b + st1 {v24.16b},[x1],#16 + orr v24.16b,v27.16b,v27.16b b.hs .Loop3x_cbc_dec cmn x2,#0x30 @@ -515,53 +1447,53 @@ .Lcbc_dec_tail: aesd v1.16b,v16.16b aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v1.16b,v17.16b aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b ld1 {v17.4s},[x7],#16 b.gt .Lcbc_dec_tail aesd v1.16b,v16.16b aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b aesd v1.16b,v17.16b aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b aesd v1.16b,v20.16b aesimc v1.16b,v1.16b - aesd v18.16b,v20.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b cmn x2,#0x20 aesd v1.16b,v21.16b aesimc v1.16b,v1.16b - aesd v18.16b,v21.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b eor v5.16b,v6.16b,v7.16b aesd v1.16b,v22.16b aesimc v1.16b,v1.16b - aesd v18.16b,v22.16b - aesimc v18.16b,v18.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b eor v17.16b,v3.16b,v7.16b aesd v1.16b,v23.16b - aesd v18.16b,v23.16b + aesd v24.16b,v23.16b b.eq .Lcbc_dec_one eor v5.16b,v5.16b,v1.16b - eor v17.16b,v17.16b,v18.16b - orr v6.16b,v19.16b,v19.16b + eor v17.16b,v17.16b,v24.16b + orr v6.16b,v27.16b,v27.16b st1 {v5.16b},[x1],#16 st1 {v17.16b},[x1],#16 b .Lcbc_done .Lcbc_dec_one: - eor v5.16b,v5.16b,v18.16b - orr v6.16b,v19.16b,v19.16b + eor v5.16b,v5.16b,v24.16b + orr v6.16b,v27.16b,v27.16b st1 {v5.16b},[x1],#16 .Lcbc_done: @@ -610,6 +1542,171 @@ rev w12, w8 sub x2,x2,#3 // bias mov v18.s[3],w12 + cmp x2,#32 + b.lo .Loop3x_ctr32 + + add w13,w8,#1 + add w14,w8,#2 + orr v24.16b,v0.16b,v0.16b + rev w13,w13 + orr v25.16b,v0.16b,v0.16b + rev w14,w14 + mov v24.s[3],w13 + sub x2,x2,#2 // bias + mov v25.s[3],w14 + add w8,w8,#2 + b .Loop5x_ctr32 + +.align 4 +.Loop5x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v25.16b,v16.16b + aesmc v25.16b,v25.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v25.16b,v17.16b + aesmc v25.16b,v25.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop5x_ctr32 + + mov x7,x3 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v25.16b,v16.16b + aesmc v25.16b,v25.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v25.16b,v17.16b + aesmc v25.16b,v25.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + add w9,w8,#1 + add w10,w8,#2 + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + add w12,w8,#3 + add w13,w8,#4 + aese v18.16b,v20.16b + aesmc v18.16b,v18.16b + add w14,w8,#5 + rev w9,w9 + aese 
v24.16b,v20.16b + aesmc v24.16b,v24.16b + rev w10,w10 + rev w12,w12 + aese v25.16b,v20.16b + aesmc v25.16b,v25.16b + rev w13,w13 + rev w14,w14 + + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + aese v18.16b,v21.16b + aesmc v18.16b,v18.16b + aese v24.16b,v21.16b + aesmc v24.16b,v24.16b + aese v25.16b,v21.16b + aesmc v25.16b,v25.16b + + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + ld1 {v2.16b},[x0],#16 + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0],#16 + aese v18.16b,v22.16b + aesmc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aese v24.16b,v22.16b + aesmc v24.16b,v24.16b + ld1 {v26.16b},[x0],#16 + aese v25.16b,v22.16b + aesmc v25.16b,v25.16b + ld1 {v27.16b},[x0],#16 + + aese v0.16b,v23.16b + eor v2.16b,v2.16b,v7.16b + aese v1.16b,v23.16b + eor v3.16b,v3.16b,v7.16b + aese v18.16b,v23.16b + eor v19.16b,v19.16b,v7.16b + aese v24.16b,v23.16b + eor v26.16b,v26.16b,v7.16b + aese v25.16b,v23.16b + eor v27.16b,v27.16b,v7.16b + + eor v2.16b,v2.16b,v0.16b + orr v0.16b,v6.16b,v6.16b + eor v3.16b,v3.16b,v1.16b + orr v1.16b,v6.16b,v6.16b + eor v19.16b,v19.16b,v18.16b + orr v18.16b,v6.16b,v6.16b + eor v26.16b,v26.16b,v24.16b + orr v24.16b,v6.16b,v6.16b + eor v27.16b,v27.16b,v25.16b + orr v25.16b,v6.16b,v6.16b + + st1 {v2.16b},[x1],#16 + mov v0.s[3],w9 + st1 {v3.16b},[x1],#16 + mov v1.s[3],w10 + st1 {v19.16b},[x1],#16 + mov v18.s[3],w12 + st1 {v26.16b},[x1],#16 + mov v24.s[3],w13 + st1 {v27.16b},[x1],#16 + mov v25.s[3],w14 + + mov w6,w5 + cbz x2,.Lctr32_done + + add w8,w8,#5 + subs x2,x2,#5 + b.hs .Loop5x_ctr32 + + add x2,x2,#5 + sub w8,w8,#5 + + cmp x2,#2 + mov x12,#16 + csel x12,xzr,x12,lo + b.ls .Lctr32_tail + + sub x2,x2,#3 // bias + add w8,w8,#3 b .Loop3x_ctr32 .align 4 @@ -754,4 +1851,1331 @@ ldr x29,[sp],#16 ret .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +.globl aes_v8_xts_encrypt +.type aes_v8_xts_encrypt,%function +.align 5 +aes_v8_xts_encrypt: + cmp x2,#16 + // Original input data size bigger than 16, jump to big size processing. + b.ne .Lxts_enc_big_size + // Encrypt the iv with key2, as the first XEX iv. + ldr w6,[x4,#240] + ld1 {v0.4s},[x4],#16 + ld1 {v6.16b},[x5] + sub w6,w6,#2 + ld1 {v1.4s},[x4],#16 + +.Loop_enc_iv_enc: + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4],#16 + subs w6,w6,#2 + aese v6.16b,v1.16b + aesmc v6.16b,v6.16b + ld1 {v1.4s},[x4],#16 + b.gt .Loop_enc_iv_enc + + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4] + aese v6.16b,v1.16b + eor v6.16b,v6.16b,v0.16b + + ld1 {v0.16b},[x0] + eor v0.16b,v6.16b,v0.16b + + ldr w6,[x3,#240] + ld1 {v28.4s,v29.4s},[x3],#32 // load key schedule... + + aese v0.16b,v28.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + aese v0.16b,v29.16b + aesmc v0.16b,v0.16b + subs w6,w6,#10 // if rounds==10, jump to aes-128-xts processing + b.eq .Lxts_128_enc +.Lxts_enc_round_loop: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x3],#16 // load key schedule... + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3],#16 // load key schedule... + subs w6,w6,#2 // bias + b.gt .Lxts_enc_round_loop +.Lxts_128_enc: + ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... 
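+ // Editor's note (illustrative, not part of the generated output): this short
+ // path handles a single 16-byte input as plain XEX, i.e. (sketch):
+ //
+ //   T = AES_encrypt(iv, key2);              /* tweak, computed above          */
+ //   C = AES_encrypt(P ^ T, key1) ^ T;       /* xor - encrypt - xor            */
+ //
+ // The remaining rounds below finish AES_encrypt(P ^ T, key1); the final two
+ // eor instructions apply the last round key (v7) and the tweak (v6).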
+ aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v7.4s},[x3] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v0.16b,v0.16b,v7.16b + eor v0.16b,v0.16b,v6.16b + st1 {v0.16b},[x1] + b .Lxts_enc_final_abort + +.align 4 +.Lxts_enc_big_size: + stp x19,x20,[sp,#-64]! + stp x21,x22,[sp,#48] + stp d8,d9,[sp,#32] + stp d10,d11,[sp,#16] + + // tailcnt store the tail value of length%16. + and x21,x2,#0xf + and x2,x2,#-16 + subs x2,x2,#16 + mov x8,#16 + b.lo .Lxts_abort + csel x8,xzr,x8,eq + + // Firstly, encrypt the iv with key2, as the first iv of XEX. + ldr w6,[x4,#240] + ld1 {v0.4s},[x4],#16 + ld1 {v6.16b},[x5] + sub w6,w6,#2 + ld1 {v1.4s},[x4],#16 + +.Loop_iv_enc: + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4],#16 + subs w6,w6,#2 + aese v6.16b,v1.16b + aesmc v6.16b,v6.16b + ld1 {v1.4s},[x4],#16 + b.gt .Loop_iv_enc + + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4] + aese v6.16b,v1.16b + eor v6.16b,v6.16b,v0.16b + + // The iv for second block + // x9- iv(low), x10 - iv(high) + // the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b + fmov x9,d6 + fmov x10,v6.d[1] + mov w19,#0x87 + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d8,x9 + fmov v8.d[1],x10 + + ldr w5,[x3,#240] // next starting point + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + + // Encryption +.Lxts_enc: + ld1 {v24.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v28.16b,v0.16b,v0.16b + orr v27.16b,v24.16b,v24.16b + orr v29.16b,v24.16b,v24.16b + b.lo .Lxts_inner_enc_tail + eor v0.16b,v0.16b,v6.16b // before encryption, xor with iv + eor v24.16b,v24.16b,v8.16b + + // The iv for third block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d9,x9 + fmov v9.d[1],x10 + + + orr v1.16b,v24.16b,v24.16b + ld1 {v24.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + eor v27.16b,v24.16b,v9.16b // the third block + eor v24.16b,v24.16b,v9.16b + cmp x2,#32 + b.lo .Lxts_outer_enc_tail + + // The iv for fourth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d10,x9 + fmov v10.d[1],x10 + + ld1 {v25.16b},[x0],#16 + // The iv for fifth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d11,x9 + fmov v11.d[1],x10 + + ld1 {v26.16b},[x0],#16 + eor v25.16b,v25.16b,v10.16b // the fourth block + eor v26.16b,v26.16b,v11.16b + sub x2,x2,#32 // bias + mov w6,w5 + b .Loop5x_xts_enc + +.align 4 +.Loop5x_xts_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v25.16b,v16.16b + aesmc v25.16b,v25.16b + aese v26.16b,v16.16b + aesmc v26.16b,v26.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v25.16b,v17.16b + aesmc v25.16b,v25.16b + aese v26.16b,v17.16b + aesmc v26.16b,v26.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop5x_xts_enc + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc 
v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v25.16b,v16.16b + aesmc v25.16b,v25.16b + aese v26.16b,v16.16b + aesmc v26.16b,v26.16b + subs x2,x2,#0x50 // because .Lxts_enc_tail4x + + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v25.16b,v17.16b + aesmc v25.16b,v25.16b + aese v26.16b,v17.16b + aesmc v26.16b,v26.16b + csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo + mov x7,x3 + + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v1.16b,v18.16b + aesmc v1.16b,v1.16b + aese v24.16b,v18.16b + aesmc v24.16b,v24.16b + aese v25.16b,v18.16b + aesmc v25.16b,v25.16b + aese v26.16b,v18.16b + aesmc v26.16b,v26.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v26.16b + // are loaded with last "words" + add x6,x2,#0x60 // because .Lxts_enc_tail4x + + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + aese v1.16b,v19.16b + aesmc v1.16b,v1.16b + aese v24.16b,v19.16b + aesmc v24.16b,v24.16b + aese v25.16b,v19.16b + aesmc v25.16b,v25.16b + aese v26.16b,v19.16b + aesmc v26.16b,v26.16b + + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + aese v24.16b,v20.16b + aesmc v24.16b,v24.16b + aese v25.16b,v20.16b + aesmc v25.16b,v25.16b + aese v26.16b,v20.16b + aesmc v26.16b,v26.16b + + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + aese v24.16b,v21.16b + aesmc v24.16b,v24.16b + aese v25.16b,v21.16b + aesmc v25.16b,v25.16b + aese v26.16b,v21.16b + aesmc v26.16b,v26.16b + + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + aese v24.16b,v22.16b + aesmc v24.16b,v24.16b + aese v25.16b,v22.16b + aesmc v25.16b,v25.16b + aese v26.16b,v22.16b + aesmc v26.16b,v26.16b + + eor v4.16b,v7.16b,v6.16b + aese v0.16b,v23.16b + // The iv for first block of one iteration + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d6,x9 + fmov v6.d[1],x10 + eor v5.16b,v7.16b,v8.16b + ld1 {v2.16b},[x0],#16 + aese v1.16b,v23.16b + // The iv for second block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d8,x9 + fmov v8.d[1],x10 + eor v17.16b,v7.16b,v9.16b + ld1 {v3.16b},[x0],#16 + aese v24.16b,v23.16b + // The iv for third block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d9,x9 + fmov v9.d[1],x10 + eor v30.16b,v7.16b,v10.16b + ld1 {v27.16b},[x0],#16 + aese v25.16b,v23.16b + // The iv for fourth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d10,x9 + fmov v10.d[1],x10 + eor v31.16b,v7.16b,v11.16b + ld1 {v28.16b},[x0],#16 + aese v26.16b,v23.16b + + // The iv for fifth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d11,x9 + fmov v11.d[1],x10 + + ld1 {v29.16b},[x0],#16 + cbz x6,.Lxts_enc_tail4x + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + eor v4.16b,v4.16b,v0.16b + eor v0.16b,v2.16b,v6.16b + eor v5.16b,v5.16b,v1.16b + eor v1.16b,v3.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + eor v24.16b,v27.16b,v9.16b + eor v30.16b,v30.16b,v25.16b + eor v25.16b,v28.16b,v10.16b + eor v31.16b,v31.16b,v26.16b + st1 {v4.16b},[x1],#16 + eor v26.16b,v29.16b,v11.16b + st1 {v5.16b},[x1],#16 + mov w6,w5 + st1 {v17.16b},[x1],#16 + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + b.hs 
.Loop5x_xts_enc + + + // If left 4 blocks, borrow the five block's processing. + cmn x2,#0x10 + b.ne .Loop5x_enc_after + orr v11.16b,v10.16b,v10.16b + orr v10.16b,v9.16b,v9.16b + orr v9.16b,v8.16b,v8.16b + orr v8.16b,v6.16b,v6.16b + fmov x9,d11 + fmov x10,v11.d[1] + eor v0.16b,v6.16b,v2.16b + eor v1.16b,v8.16b,v3.16b + eor v24.16b,v27.16b,v9.16b + eor v25.16b,v28.16b,v10.16b + eor v26.16b,v29.16b,v11.16b + b.eq .Loop5x_xts_enc + +.Loop5x_enc_after: + add x2,x2,#0x50 + cbz x2,.Lxts_enc_done + + add w6,w5,#2 + subs x2,x2,#0x30 + b.lo .Lxts_inner_enc_tail + + eor v0.16b,v6.16b,v27.16b + eor v1.16b,v8.16b,v28.16b + eor v24.16b,v29.16b,v9.16b + b .Lxts_outer_enc_tail + +.align 4 +.Lxts_enc_tail4x: + add x0,x0,#16 + eor v5.16b,v1.16b,v5.16b + st1 {v5.16b},[x1],#16 + eor v17.16b,v24.16b,v17.16b + st1 {v17.16b},[x1],#16 + eor v30.16b,v25.16b,v30.16b + eor v31.16b,v26.16b,v31.16b + st1 {v30.16b,v31.16b},[x1],#32 + + b .Lxts_enc_done +.align 4 +.Lxts_outer_enc_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lxts_outer_enc_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + // The iv for first block + fmov x9,d9 + fmov x10,v9.d[1] + //mov w19,#0x87 + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr#31 + eor x9,x11,x9,lsl#1 + fmov d6,x9 + fmov v6.d[1],x10 + eor v5.16b,v8.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + eor v17.16b,v9.16b,v7.16b + + add x6,x6,#0x20 + add x0,x0,x6 + mov x7,x3 + + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + aese v24.16b,v20.16b + aesmc v24.16b,v24.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + aese v24.16b,v21.16b + aesmc v24.16b,v24.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + aese v24.16b,v22.16b + aesmc v24.16b,v24.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + aese v24.16b,v23.16b + ld1 {v27.16b},[x0],#16 + add w6,w5,#2 + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v24.16b,v24.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + st1 {v5.16b},[x1],#16 + st1 {v24.16b},[x1],#16 + cmn x2,#0x30 + b.eq .Lxts_enc_done +.Lxts_encxor_one: + orr v28.16b,v3.16b,v3.16b + orr v29.16b,v27.16b,v27.16b + nop + +.Lxts_inner_enc_tail: + cmn x2,#0x10 + eor v1.16b,v28.16b,v6.16b + eor v24.16b,v29.16b,v8.16b + b.eq .Lxts_enc_tail_loop + eor v24.16b,v29.16b,v6.16b +.Lxts_enc_tail_loop: + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lxts_enc_tail_loop + + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v24.16b,v16.16b + aesmc v24.16b,v24.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v24.16b,v17.16b + aesmc v24.16b,v24.16b + aese v1.16b,v20.16b + 
aesmc v1.16b,v1.16b + aese v24.16b,v20.16b + aesmc v24.16b,v24.16b + cmn x2,#0x20 + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + aese v24.16b,v21.16b + aesmc v24.16b,v24.16b + eor v5.16b,v6.16b,v7.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + aese v24.16b,v22.16b + aesmc v24.16b,v24.16b + eor v17.16b,v8.16b,v7.16b + aese v1.16b,v23.16b + aese v24.16b,v23.16b + b.eq .Lxts_enc_one + eor v5.16b,v5.16b,v1.16b + st1 {v5.16b},[x1],#16 + eor v17.16b,v17.16b,v24.16b + orr v6.16b,v8.16b,v8.16b + st1 {v17.16b},[x1],#16 + fmov x9,d8 + fmov x10,v8.d[1] + mov w19,#0x87 + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d6,x9 + fmov v6.d[1],x10 + b .Lxts_enc_done + +.Lxts_enc_one: + eor v5.16b,v5.16b,v24.16b + orr v6.16b,v6.16b,v6.16b + st1 {v5.16b},[x1],#16 + fmov x9,d6 + fmov x10,v6.d[1] + mov w19,#0x87 + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d6,x9 + fmov v6.d[1],x10 + b .Lxts_enc_done +.align 5 +.Lxts_enc_done: + // Process the tail block with cipher stealing. + tst x21,#0xf + b.eq .Lxts_abort + + mov x20,x0 + mov x13,x1 + sub x1,x1,#16 +.composite_enc_loop: + subs x21,x21,#1 + ldrb w15,[x1,x21] + ldrb w14,[x20,x21] + strb w15,[x13,x21] + strb w14,[x1,x21] + b.gt .composite_enc_loop +.Lxts_enc_load_done: + ld1 {v26.16b},[x1] + eor v26.16b,v26.16b,v6.16b + + // Encrypt the composite block to get the last second encrypted text block + ldr w6,[x3,#240] // load key schedule... + ld1 {v0.4s},[x3],#16 + sub w6,w6,#2 + ld1 {v1.4s},[x3],#16 // load key schedule... +.Loop_final_enc: + aese v26.16b,v0.16b + aesmc v26.16b,v26.16b + ld1 {v0.4s},[x3],#16 + subs w6,w6,#2 + aese v26.16b,v1.16b + aesmc v26.16b,v26.16b + ld1 {v1.4s},[x3],#16 + b.gt .Loop_final_enc + + aese v26.16b,v0.16b + aesmc v26.16b,v26.16b + ld1 {v0.4s},[x3] + aese v26.16b,v1.16b + eor v26.16b,v26.16b,v0.16b + eor v26.16b,v26.16b,v6.16b + st1 {v26.16b},[x1] + +.Lxts_abort: + ldp x21,x22,[sp,#48] + ldp d8,d9,[sp,#32] + ldp d10,d11,[sp,#16] + ldp x19,x20,[sp],#64 +.Lxts_enc_final_abort: + ret +.size aes_v8_xts_encrypt,.-aes_v8_xts_encrypt +.globl aes_v8_xts_decrypt +.type aes_v8_xts_decrypt,%function +.align 5 +aes_v8_xts_decrypt: + cmp x2,#16 + // Original input data size bigger than 16, jump to big size processing. + b.ne .Lxts_dec_big_size + // Encrypt the iv with key2, as the first XEX iv. + ldr w6,[x4,#240] + ld1 {v0.4s},[x4],#16 + ld1 {v6.16b},[x5] + sub w6,w6,#2 + ld1 {v1.4s},[x4],#16 + +.Loop_dec_small_iv_enc: + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4],#16 + subs w6,w6,#2 + aese v6.16b,v1.16b + aesmc v6.16b,v6.16b + ld1 {v1.4s},[x4],#16 + b.gt .Loop_dec_small_iv_enc + + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4] + aese v6.16b,v1.16b + eor v6.16b,v6.16b,v0.16b + + ld1 {v0.16b},[x0] + eor v0.16b,v6.16b,v0.16b + + ldr w6,[x3,#240] + ld1 {v28.4s,v29.4s},[x3],#32 // load key schedule... + + aesd v0.16b,v28.16b + aesimc v0.16b,v0.16b + ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + aesd v0.16b,v29.16b + aesimc v0.16b,v0.16b + subs w6,w6,#10 // bias + b.eq .Lxts_128_dec +.Lxts_dec_round_loop: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + ld1 {v16.4s},[x3],#16 // load key schedule... + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + ld1 {v17.4s},[x3],#16 // load key schedule... + subs w6,w6,#2 // bias + b.gt .Lxts_dec_round_loop +.Lxts_128_dec: + ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 
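+ // Editor's note (illustrative, not part of the generated output): the
+ // single-block decrypt path mirrors the encrypt one; the tweak is still
+ // produced by *encrypting* the IV under key2, and only the data transform
+ // switches to aesd/aesimc under key1:
+ //
+ //   T = AES_encrypt(iv, key2);
+ //   P = AES_decrypt(C ^ T, key1) ^ T;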
+ aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + aesd v0.16b,v18.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v19.16b + aesimc v0.16b,v0.16b + ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + ld1 {v7.4s},[x3] + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v23.16b + eor v0.16b,v0.16b,v7.16b + eor v0.16b,v6.16b,v0.16b + st1 {v0.16b},[x1] + b .Lxts_dec_final_abort +.Lxts_dec_big_size: + stp x19,x20,[sp,#-64]! + stp x21,x22,[sp,#48] + stp d8,d9,[sp,#32] + stp d10,d11,[sp,#16] + + and x21,x2,#0xf + and x2,x2,#-16 + subs x2,x2,#16 + mov x8,#16 + b.lo .Lxts_dec_abort + + // Encrypt the iv with key2, as the first XEX iv + ldr w6,[x4,#240] + ld1 {v0.4s},[x4],#16 + ld1 {v6.16b},[x5] + sub w6,w6,#2 + ld1 {v1.4s},[x4],#16 + +.Loop_dec_iv_enc: + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4],#16 + subs w6,w6,#2 + aese v6.16b,v1.16b + aesmc v6.16b,v6.16b + ld1 {v1.4s},[x4],#16 + b.gt .Loop_dec_iv_enc + + aese v6.16b,v0.16b + aesmc v6.16b,v6.16b + ld1 {v0.4s},[x4] + aese v6.16b,v1.16b + eor v6.16b,v6.16b,v0.16b + + // The iv for second block + // x9- iv(low), x10 - iv(high) + // the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b + fmov x9,d6 + fmov x10,v6.d[1] + mov w19,#0x87 + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d8,x9 + fmov v8.d[1],x10 + + ldr w5,[x3,#240] // load rounds number + + // The iv for third block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d9,x9 + fmov v9.d[1],x10 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 // load key schedule... 
+ ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + // The iv for fourth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d10,x9 + fmov v10.d[1],x10 + + add x7,x3,#32 + mov w6,w5 + b .Lxts_dec + + // Decryption +.align 5 +.Lxts_dec: + tst x21,#0xf + b.eq .Lxts_dec_begin + subs x2,x2,#16 + csel x8,xzr,x8,eq + ld1 {v0.16b},[x0],#16 + b.lo .Lxts_done + sub x0,x0,#16 +.Lxts_dec_begin: + ld1 {v0.16b},[x0],x8 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v28.16b,v0.16b,v0.16b + ld1 {v24.16b},[x0],#16 + orr v27.16b,v24.16b,v24.16b + orr v29.16b,v24.16b,v24.16b + b.lo .Lxts_inner_dec_tail + eor v0.16b,v0.16b,v6.16b // before decryt, xor with iv + eor v24.16b,v24.16b,v8.16b + + orr v1.16b,v24.16b,v24.16b + ld1 {v24.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + eor v27.16b,v24.16b,v9.16b // third block xox with third iv + eor v24.16b,v24.16b,v9.16b + cmp x2,#32 + b.lo .Lxts_outer_dec_tail + + ld1 {v25.16b},[x0],#16 + + // The iv for fifth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d11,x9 + fmov v11.d[1],x10 + + ld1 {v26.16b},[x0],#16 + eor v25.16b,v25.16b,v10.16b // the fourth block + eor v26.16b,v26.16b,v11.16b + sub x2,x2,#32 // bias + mov w6,w5 + b .Loop5x_xts_dec + +.align 4 +.Loop5x_xts_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v16.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v16.16b + aesimc v26.16b,v26.16b + ld1 {v16.4s},[x7],#16 // load key schedule... + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v17.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v17.16b + aesimc v26.16b,v26.16b + ld1 {v17.4s},[x7],#16 // load key schedule... 
+ b.gt .Loop5x_xts_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v16.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v16.16b + aesimc v26.16b,v26.16b + subs x2,x2,#0x50 // because .Lxts_dec_tail4x + + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v17.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v17.16b + aesimc v26.16b,v26.16b + csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo + mov x7,x3 + + aesd v0.16b,v18.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v18.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v18.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v18.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v18.16b + aesimc v26.16b,v26.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v26.16b + // are loaded with last "words" + add x6,x2,#0x60 // because .Lxts_dec_tail4x + + aesd v0.16b,v19.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v19.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v19.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v19.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v19.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v20.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v20.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v21.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v21.16b + aesimc v26.16b,v26.16b + + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + aesd v25.16b,v22.16b + aesimc v25.16b,v25.16b + aesd v26.16b,v22.16b + aesimc v26.16b,v26.16b + + eor v4.16b,v7.16b,v6.16b + aesd v0.16b,v23.16b + // The iv for first block of next iteration. 
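+ // Editor's note (illustrative, not part of the generated output): each
+ // extr/and/eor/fmov group like the one below advances the XTS tweak by
+ // multiplying it by x in GF(2^128) with reduction polynomial
+ // x^128 + x^7 + x^2 + x + 1 (hence the 0x87 in w19).  In C terms, with
+ // (lo, hi) = (x9, x10):
+ //
+ //   carry = hi >> 63;
+ //   hi    = (hi << 1) | (lo >> 63);
+ //   lo    = (lo << 1) ^ (carry ? 0x87 : 0);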
+ extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d6,x9 + fmov v6.d[1],x10 + eor v5.16b,v7.16b,v8.16b + ld1 {v2.16b},[x0],#16 + aesd v1.16b,v23.16b + // The iv for second block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d8,x9 + fmov v8.d[1],x10 + eor v17.16b,v7.16b,v9.16b + ld1 {v3.16b},[x0],#16 + aesd v24.16b,v23.16b + // The iv for third block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d9,x9 + fmov v9.d[1],x10 + eor v30.16b,v7.16b,v10.16b + ld1 {v27.16b},[x0],#16 + aesd v25.16b,v23.16b + // The iv for fourth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d10,x9 + fmov v10.d[1],x10 + eor v31.16b,v7.16b,v11.16b + ld1 {v28.16b},[x0],#16 + aesd v26.16b,v23.16b + + // The iv for fifth block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d11,x9 + fmov v11.d[1],x10 + + ld1 {v29.16b},[x0],#16 + cbz x6,.Lxts_dec_tail4x + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + eor v4.16b,v4.16b,v0.16b + eor v0.16b,v2.16b,v6.16b + eor v5.16b,v5.16b,v1.16b + eor v1.16b,v3.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + eor v24.16b,v27.16b,v9.16b + eor v30.16b,v30.16b,v25.16b + eor v25.16b,v28.16b,v10.16b + eor v31.16b,v31.16b,v26.16b + st1 {v4.16b},[x1],#16 + eor v26.16b,v29.16b,v11.16b + st1 {v5.16b},[x1],#16 + mov w6,w5 + st1 {v17.16b},[x1],#16 + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[x1],#16 + st1 {v31.16b},[x1],#16 + b.hs .Loop5x_xts_dec + + cmn x2,#0x10 + b.ne .Loop5x_dec_after + // If x2(x2) equal to -0x10, the left blocks is 4. + // After specially processing, utilize the five blocks processing again. + // It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b. 
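+ // Editor's note: in other words, rather than keeping a dedicated 4x path, the
+ // orr sequence below shifts the precomputed tweaks down one slot
+ // (v11<-v10, v10<-v9, v9<-v8, v8<-v6) so the four remaining blocks can be fed
+ // straight back through the 5x pipeline above.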
+ orr v11.16b,v10.16b,v10.16b + orr v10.16b,v9.16b,v9.16b + orr v9.16b,v8.16b,v8.16b + orr v8.16b,v6.16b,v6.16b + fmov x9,d11 + fmov x10,v11.d[1] + eor v0.16b,v6.16b,v2.16b + eor v1.16b,v8.16b,v3.16b + eor v24.16b,v27.16b,v9.16b + eor v25.16b,v28.16b,v10.16b + eor v26.16b,v29.16b,v11.16b + b.eq .Loop5x_xts_dec + +.Loop5x_dec_after: + add x2,x2,#0x50 + cbz x2,.Lxts_done + + add w6,w5,#2 + subs x2,x2,#0x30 + b.lo .Lxts_inner_dec_tail + + eor v0.16b,v6.16b,v27.16b + eor v1.16b,v8.16b,v28.16b + eor v24.16b,v29.16b,v9.16b + b .Lxts_outer_dec_tail + +.align 4 +.Lxts_dec_tail4x: + add x0,x0,#16 + tst x21,#0xf + eor v5.16b,v1.16b,v4.16b + st1 {v5.16b},[x1],#16 + eor v17.16b,v24.16b,v17.16b + st1 {v17.16b},[x1],#16 + eor v30.16b,v25.16b,v30.16b + eor v31.16b,v26.16b,v31.16b + st1 {v30.16b,v31.16b},[x1],#32 + + b.eq .Lxts_dec_abort + ld1 {v0.16b},[x0],#16 + b .Lxts_done +.align 4 +.Lxts_outer_dec_tail: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lxts_outer_dec_tail + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + // The iv for first block + fmov x9,d9 + fmov x10,v9.d[1] + mov w19,#0x87 + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d6,x9 + fmov v6.d[1],x10 + eor v5.16b,v8.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + eor v17.16b,v9.16b,v7.16b + // The iv for second block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d8,x9 + fmov v8.d[1],x10 + + add x6,x6,#0x20 + add x0,x0,x6 // x0 is adjusted to the last data + + mov x7,x3 + + // The iv for third block + extr x22,x10,x10,#32 + extr x10,x10,x9,#63 + and w11,w19,w22,asr #31 + eor x9,x11,x9,lsl #1 + fmov d9,x9 + fmov v9.d[1],x10 + + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + ld1 {v27.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v24.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v24.16b,v24.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + st1 {v5.16b},[x1],#16 + st1 {v24.16b},[x1],#16 + + cmn x2,#0x30 + add x2,x2,#0x30 + b.eq .Lxts_done + sub x2,x2,#0x30 + orr v28.16b,v3.16b,v3.16b + orr v29.16b,v27.16b,v27.16b + nop + +.Lxts_inner_dec_tail: + // x2 == -0x10 means two blocks left. 
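+	// x2 == -0x20 means only one block is left.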
+ cmn x2,#0x10 + eor v1.16b,v28.16b,v6.16b + eor v24.16b,v29.16b,v8.16b + b.eq .Lxts_dec_tail_loop + eor v24.16b,v29.16b,v6.16b +.Lxts_dec_tail_loop: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lxts_dec_tail_loop + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v16.16b + aesimc v24.16b,v24.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v17.16b + aesimc v24.16b,v24.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v20.16b + aesimc v24.16b,v24.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v21.16b + aesimc v24.16b,v24.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v24.16b,v22.16b + aesimc v24.16b,v24.16b + eor v17.16b,v8.16b,v7.16b + aesd v1.16b,v23.16b + aesd v24.16b,v23.16b + b.eq .Lxts_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v24.16b + orr v6.16b,v9.16b,v9.16b + orr v8.16b,v10.16b,v10.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + add x2,x2,#16 + b .Lxts_done + +.Lxts_dec_one: + eor v5.16b,v5.16b,v24.16b + orr v6.16b,v8.16b,v8.16b + orr v8.16b,v9.16b,v9.16b + st1 {v5.16b},[x1],#16 + add x2,x2,#32 + +.Lxts_done: + tst x21,#0xf + b.eq .Lxts_dec_abort + // Processing the last two blocks with cipher stealing. + mov x7,x3 + cbnz x2,.Lxts_dec_1st_done + ld1 {v0.16b},[x0],#16 + + // Decrypt the last secod block to get the last plain text block +.Lxts_dec_1st_done: + eor v26.16b,v0.16b,v8.16b + ldr w6,[x3,#240] + ld1 {v0.4s},[x3],#16 + sub w6,w6,#2 + ld1 {v1.4s},[x3],#16 +.Loop_final_2nd_dec: + aesd v26.16b,v0.16b + aesimc v26.16b,v26.16b + ld1 {v0.4s},[x3],#16 // load key schedule... + subs w6,w6,#2 + aesd v26.16b,v1.16b + aesimc v26.16b,v26.16b + ld1 {v1.4s},[x3],#16 // load key schedule... + b.gt .Loop_final_2nd_dec + + aesd v26.16b,v0.16b + aesimc v26.16b,v26.16b + ld1 {v0.4s},[x3] + aesd v26.16b,v1.16b + eor v26.16b,v26.16b,v0.16b + eor v26.16b,v26.16b,v8.16b + st1 {v26.16b},[x1] + + mov x20,x0 + add x13,x1,#16 + + // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks + // to get the last encrypted block. +.composite_dec_loop: + subs x21,x21,#1 + ldrb w15,[x1,x21] + ldrb w14,[x20,x21] + strb w15,[x13,x21] + strb w14,[x1,x21] + b.gt .composite_dec_loop +.Lxts_dec_load_done: + ld1 {v26.16b},[x1] + eor v26.16b,v26.16b,v6.16b + + // Decrypt the composite block to get the last second plain text block + ldr w6,[x7,#240] + ld1 {v0.4s},[x7],#16 + sub w6,w6,#2 + ld1 {v1.4s},[x7],#16 +.Loop_final_dec: + aesd v26.16b,v0.16b + aesimc v26.16b,v26.16b + ld1 {v0.4s},[x7],#16 // load key schedule... + subs w6,w6,#2 + aesd v26.16b,v1.16b + aesimc v26.16b,v26.16b + ld1 {v1.4s},[x7],#16 // load key schedule... 
+ b.gt .Loop_final_dec + + aesd v26.16b,v0.16b + aesimc v26.16b,v26.16b + ld1 {v0.4s},[x7] + aesd v26.16b,v1.16b + eor v26.16b,v26.16b,v0.16b + eor v26.16b,v26.16b,v6.16b + st1 {v26.16b},[x1] + +.Lxts_dec_abort: + ldp x21,x22,[sp,#48] + ldp d8,d9,[sp,#32] + ldp d10,d11,[sp,#16] + ldp x19,x20,[sp],#64 + +.Lxts_dec_final_abort: + ret +.size aes_v8_xts_decrypt,.-aes_v8_xts_decrypt #endif diff --git a/sys/crypto/openssl/aarch64/arm64cpuid.S b/sys/crypto/openssl/aarch64/arm64cpuid.S --- a/sys/crypto/openssl/aarch64/arm64cpuid.S +++ b/sys/crypto/openssl/aarch64/arm64cpuid.S @@ -58,6 +58,13 @@ ret .size _armv8_sha512_probe,.-_armv8_sha512_probe +.globl _armv8_cpuid_probe +.type _armv8_cpuid_probe,%function +_armv8_cpuid_probe: + mrs x0, midr_el1 + ret +.size _armv8_cpuid_probe,.-_armv8_cpuid_probe + .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 diff --git a/sys/crypto/openssl/aarch64/armv8-mont.S b/sys/crypto/openssl/aarch64/armv8-mont.S --- a/sys/crypto/openssl/aarch64/armv8-mont.S +++ b/sys/crypto/openssl/aarch64/armv8-mont.S @@ -1,14 +1,32 @@ /* Do not modify. This file is auto-generated from armv8-mont.pl. */ +#ifndef __KERNEL__ +# include "arm_arch.h" + +.hidden OPENSSL_armv8_rsa_neonized +#endif .text .globl bn_mul_mont .type bn_mul_mont,%function .align 5 bn_mul_mont: +.Lbn_mul_mont: + tst x5,#3 + b.ne .Lmul_mont + cmp x5,#32 + b.le .Lscalar_impl +#ifndef __KERNEL__ + adrp x17,OPENSSL_armv8_rsa_neonized + ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] + cbnz w17, bn_mul8x_mont_neon +#endif + +.Lscalar_impl: tst x5,#7 b.eq __bn_sqr8x_mont tst x5,#3 b.eq __bn_mul4x_mont + .Lmul_mont: stp x29,x30,[sp,#-64]! add x29,sp,#0 @@ -132,7 +150,7 @@ mul x16,x14,x15 // np[j]*m1 adds x12,x12,x6 umulh x17,x14,x15 - str x12,[x22,#-16] // tp[j-1] + stur x12,[x22,#-16] // tp[j-1] cbnz x21,.Linner .Linner_skip: @@ -188,13 +206,13 @@ csel x14,x23,x8,lo // did it borrow? ldr x23,[x22],#8 ldr x8,[x0],#8 - str xzr,[x22,#-16] // wipe tp - str x14,[x0,#-16] + stur xzr,[x22,#-16] // wipe tp + stur x14,[x0,#-16] cbnz x5,.Lcond_copy csel x14,x23,x8,lo - str xzr,[x22,#-8] // wipe tp - str x14,[x0,#-8] + stur xzr,[x22,#-8] // wipe tp + stur x14,[x0,#-8] ldp x19,x20,[x29,#16] mov sp,x29 @@ -204,6 +222,704 @@ ldr x29,[sp],#64 ret .size bn_mul_mont,.-bn_mul_mont +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + stp x29,x30,[sp,#-80]! 
+ mov x16,sp + stp d8,d9,[sp,#16] + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + lsl x5,x5,#1 + eor v14.16b,v14.16b,v14.16b + +.align 4 +.LNEON_8n: + eor v6.16b,v6.16b,v6.16b + sub x7,sp,#128 + eor v7.16b,v7.16b,v7.16b + sub x7,x7,x5,lsl#4 + eor v8.16b,v8.16b,v8.16b + and x7,x7,#-64 + eor v9.16b,v9.16b,v9.16b + mov sp,x7 // alloca + eor v10.16b,v10.16b,v10.16b + add x7,x7,#256 + eor v11.16b,v11.16b,v11.16b + sub x8,x5,#8 + eor v12.16b,v12.16b,v12.16b + eor v13.16b,v13.16b,v13.16b + +.LNEON_8n_init: + st1 {v6.2d,v7.2d},[x7],#32 + subs x8,x8,#8 + st1 {v8.2d,v9.2d},[x7],#32 + st1 {v10.2d,v11.2d},[x7],#32 + st1 {v12.2d,v13.2d},[x7],#32 + bne .LNEON_8n_init + + add x6,sp,#256 + ld1 {v0.4s,v1.4s},[x1],#32 + add x10,sp,#8 + ldr s30,[x4],#4 + mov x9,x5 + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + ldr s28,[x2],#4 // *b++ + uxtl v28.4s,v28.4h + add x7,sp,#128 + ld1 {v2.4s,v3.4s},[x3],#32 + + umlal v6.2d,v28.2s,v0.s[0] + umlal v7.2d,v28.2s,v0.s[1] + umlal v8.2d,v28.2s,v0.s[2] + shl v29.2d,v6.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v9.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v6.2d + umlal v10.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v11.2d,v28.2s,v1.s[1] + st1 {v28.2s},[sp] // put aside smashed b[8*i+0] + umlal v12.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v13.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v6.2d,v29.2s,v2.s[0] + umlal v7.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v8.2d,v29.2s,v2.s[2] + ushr v15.2d,v6.2d,#16 + umlal v9.2d,v29.2s,v2.s[3] + umlal v10.2d,v29.2s,v3.s[0] + ext v6.16b,v6.16b,v6.16b,#8 + add v6.2d,v6.2d,v15.2d + umlal v11.2d,v29.2s,v3.s[1] + ushr v6.2d,v6.2d,#16 + umlal v12.2d,v29.2s,v3.s[2] + umlal v13.2d,v29.2s,v3.s[3] + add v16.2d,v7.2d,v6.2d + ins v7.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0] + umlal v7.2d,v28.2s,v0.s[0] + ld1 {v6.2d},[x6],#16 + umlal v8.2d,v28.2s,v0.s[1] + umlal v9.2d,v28.2s,v0.s[2] + shl v29.2d,v7.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v10.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v7.2d + umlal v11.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v12.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] + umlal v13.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v6.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v7.2d,v29.2s,v2.s[0] + umlal v8.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v9.2d,v29.2s,v2.s[2] + ushr v15.2d,v7.2d,#16 + umlal v10.2d,v29.2s,v2.s[3] + umlal v11.2d,v29.2s,v3.s[0] + ext v7.16b,v7.16b,v7.16b,#8 + add v7.2d,v7.2d,v15.2d + umlal v12.2d,v29.2s,v3.s[1] + ushr v7.2d,v7.2d,#16 + umlal v13.2d,v29.2s,v3.s[2] + umlal v6.2d,v29.2s,v3.s[3] + add v16.2d,v8.2d,v7.2d + ins v8.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] + umlal v8.2d,v28.2s,v0.s[0] + ld1 {v7.2d},[x6],#16 + umlal v9.2d,v28.2s,v0.s[1] + umlal v10.2d,v28.2s,v0.s[2] + shl v29.2d,v8.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v11.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v8.2d + umlal v12.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v13.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] + umlal v6.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v7.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v8.2d,v29.2s,v2.s[0] + umlal v9.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v10.2d,v29.2s,v2.s[2] + ushr v15.2d,v8.2d,#16 + umlal v11.2d,v29.2s,v2.s[3] + umlal v12.2d,v29.2s,v3.s[0] + ext v8.16b,v8.16b,v8.16b,#8 + add v8.2d,v8.2d,v15.2d + umlal v13.2d,v29.2s,v3.s[1] + ushr v8.2d,v8.2d,#16 + umlal 
v6.2d,v29.2s,v3.s[2] + umlal v7.2d,v29.2s,v3.s[3] + add v16.2d,v9.2d,v8.2d + ins v9.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] + umlal v9.2d,v28.2s,v0.s[0] + ld1 {v8.2d},[x6],#16 + umlal v10.2d,v28.2s,v0.s[1] + umlal v11.2d,v28.2s,v0.s[2] + shl v29.2d,v9.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v12.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v9.2d + umlal v13.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v6.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] + umlal v7.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v8.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v9.2d,v29.2s,v2.s[0] + umlal v10.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v11.2d,v29.2s,v2.s[2] + ushr v15.2d,v9.2d,#16 + umlal v12.2d,v29.2s,v2.s[3] + umlal v13.2d,v29.2s,v3.s[0] + ext v9.16b,v9.16b,v9.16b,#8 + add v9.2d,v9.2d,v15.2d + umlal v6.2d,v29.2s,v3.s[1] + ushr v9.2d,v9.2d,#16 + umlal v7.2d,v29.2s,v3.s[2] + umlal v8.2d,v29.2s,v3.s[3] + add v16.2d,v10.2d,v9.2d + ins v10.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] + umlal v10.2d,v28.2s,v0.s[0] + ld1 {v9.2d},[x6],#16 + umlal v11.2d,v28.2s,v0.s[1] + umlal v12.2d,v28.2s,v0.s[2] + shl v29.2d,v10.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v13.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v10.2d + umlal v6.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v7.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] + umlal v8.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v9.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v10.2d,v29.2s,v2.s[0] + umlal v11.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v12.2d,v29.2s,v2.s[2] + ushr v15.2d,v10.2d,#16 + umlal v13.2d,v29.2s,v2.s[3] + umlal v6.2d,v29.2s,v3.s[0] + ext v10.16b,v10.16b,v10.16b,#8 + add v10.2d,v10.2d,v15.2d + umlal v7.2d,v29.2s,v3.s[1] + ushr v10.2d,v10.2d,#16 + umlal v8.2d,v29.2s,v3.s[2] + umlal v9.2d,v29.2s,v3.s[3] + add v16.2d,v11.2d,v10.2d + ins v11.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4] + umlal v11.2d,v28.2s,v0.s[0] + ld1 {v10.2d},[x6],#16 + umlal v12.2d,v28.2s,v0.s[1] + umlal v13.2d,v28.2s,v0.s[2] + shl v29.2d,v11.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v6.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v11.2d + umlal v7.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v8.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] + umlal v9.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v10.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v11.2d,v29.2s,v2.s[0] + umlal v12.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v13.2d,v29.2s,v2.s[2] + ushr v15.2d,v11.2d,#16 + umlal v6.2d,v29.2s,v2.s[3] + umlal v7.2d,v29.2s,v3.s[0] + ext v11.16b,v11.16b,v11.16b,#8 + add v11.2d,v11.2d,v15.2d + umlal v8.2d,v29.2s,v3.s[1] + ushr v11.2d,v11.2d,#16 + umlal v9.2d,v29.2s,v3.s[2] + umlal v10.2d,v29.2s,v3.s[3] + add v16.2d,v12.2d,v11.2d + ins v12.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5] + umlal v12.2d,v28.2s,v0.s[0] + ld1 {v11.2d},[x6],#16 + umlal v13.2d,v28.2s,v0.s[1] + umlal v6.2d,v28.2s,v0.s[2] + shl v29.2d,v12.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v7.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v12.2d + umlal v8.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v9.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] + umlal v10.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v11.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v12.2d,v29.2s,v2.s[0] + umlal v13.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v6.2d,v29.2s,v2.s[2] + 
ushr v15.2d,v12.2d,#16 + umlal v7.2d,v29.2s,v2.s[3] + umlal v8.2d,v29.2s,v3.s[0] + ext v12.16b,v12.16b,v12.16b,#8 + add v12.2d,v12.2d,v15.2d + umlal v9.2d,v29.2s,v3.s[1] + ushr v12.2d,v12.2d,#16 + umlal v10.2d,v29.2s,v3.s[2] + umlal v11.2d,v29.2s,v3.s[3] + add v16.2d,v13.2d,v12.2d + ins v13.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6] + umlal v13.2d,v28.2s,v0.s[0] + ld1 {v12.2d},[x6],#16 + umlal v6.2d,v28.2s,v0.s[1] + umlal v7.2d,v28.2s,v0.s[2] + shl v29.2d,v13.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v8.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v13.2d + umlal v9.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v10.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] + umlal v11.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v12.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[sp] // pull smashed b[8*i+0] + umlal v13.2d,v29.2s,v2.s[0] + ld1 {v0.4s,v1.4s},[x1],#32 + umlal v6.2d,v29.2s,v2.s[1] + umlal v7.2d,v29.2s,v2.s[2] + mov v5.16b,v13.16b + ushr v5.2d,v5.2d,#16 + ext v13.16b,v13.16b,v13.16b,#8 + umlal v8.2d,v29.2s,v2.s[3] + umlal v9.2d,v29.2s,v3.s[0] + add v13.2d,v13.2d,v5.2d + umlal v10.2d,v29.2s,v3.s[1] + ushr v13.2d,v13.2d,#16 + eor v15.16b,v15.16b,v15.16b + ins v13.d[1],v15.d[0] + umlal v11.2d,v29.2s,v3.s[2] + umlal v12.2d,v29.2s,v3.s[3] + add v6.2d,v6.2d,v13.2d + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] + add x10,sp,#8 // rewind + sub x8,x5,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs x8,x8,#8 + umlal v6.2d,v28.2s,v0.s[0] + ld1 {v13.2d},[x6] + umlal v7.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0] + umlal v8.2d,v28.2s,v0.s[2] + ld1 {v2.4s,v3.4s},[x3],#32 + umlal v9.2d,v28.2s,v0.s[3] + b.eq .LInner_jump + add x6,x6,#16 // don't advance in last iteration +.LInner_jump: + umlal v10.2d,v28.2s,v1.s[0] + umlal v11.2d,v28.2s,v1.s[1] + umlal v12.2d,v28.2s,v1.s[2] + umlal v13.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] + umlal v6.2d,v29.2s,v2.s[0] + umlal v7.2d,v29.2s,v2.s[1] + umlal v8.2d,v29.2s,v2.s[2] + umlal v9.2d,v29.2s,v2.s[3] + umlal v10.2d,v29.2s,v3.s[0] + umlal v11.2d,v29.2s,v3.s[1] + umlal v12.2d,v29.2s,v3.s[2] + umlal v13.2d,v29.2s,v3.s[3] + st1 {v6.2d},[x7],#16 + umlal v7.2d,v28.2s,v0.s[0] + ld1 {v6.2d},[x6] + umlal v8.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] + umlal v9.2d,v28.2s,v0.s[2] + b.eq .LInner_jump1 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump1: + umlal v10.2d,v28.2s,v0.s[3] + umlal v11.2d,v28.2s,v1.s[0] + umlal v12.2d,v28.2s,v1.s[1] + umlal v13.2d,v28.2s,v1.s[2] + umlal v6.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] + umlal v7.2d,v29.2s,v2.s[0] + umlal v8.2d,v29.2s,v2.s[1] + umlal v9.2d,v29.2s,v2.s[2] + umlal v10.2d,v29.2s,v2.s[3] + umlal v11.2d,v29.2s,v3.s[0] + umlal v12.2d,v29.2s,v3.s[1] + umlal v13.2d,v29.2s,v3.s[2] + umlal v6.2d,v29.2s,v3.s[3] + st1 {v7.2d},[x7],#16 + umlal v8.2d,v28.2s,v0.s[0] + ld1 {v7.2d},[x6] + umlal v9.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] + umlal v10.2d,v28.2s,v0.s[2] + b.eq .LInner_jump2 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump2: + umlal v11.2d,v28.2s,v0.s[3] + umlal v12.2d,v28.2s,v1.s[0] + umlal v13.2d,v28.2s,v1.s[1] + umlal v6.2d,v28.2s,v1.s[2] + umlal v7.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] + umlal v8.2d,v29.2s,v2.s[0] + umlal v9.2d,v29.2s,v2.s[1] + umlal v10.2d,v29.2s,v2.s[2] + umlal v11.2d,v29.2s,v2.s[3] + umlal v12.2d,v29.2s,v3.s[0] + umlal v13.2d,v29.2s,v3.s[1] + umlal 
v6.2d,v29.2s,v3.s[2] + umlal v7.2d,v29.2s,v3.s[3] + st1 {v8.2d},[x7],#16 + umlal v9.2d,v28.2s,v0.s[0] + ld1 {v8.2d},[x6] + umlal v10.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] + umlal v11.2d,v28.2s,v0.s[2] + b.eq .LInner_jump3 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump3: + umlal v12.2d,v28.2s,v0.s[3] + umlal v13.2d,v28.2s,v1.s[0] + umlal v6.2d,v28.2s,v1.s[1] + umlal v7.2d,v28.2s,v1.s[2] + umlal v8.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] + umlal v9.2d,v29.2s,v2.s[0] + umlal v10.2d,v29.2s,v2.s[1] + umlal v11.2d,v29.2s,v2.s[2] + umlal v12.2d,v29.2s,v2.s[3] + umlal v13.2d,v29.2s,v3.s[0] + umlal v6.2d,v29.2s,v3.s[1] + umlal v7.2d,v29.2s,v3.s[2] + umlal v8.2d,v29.2s,v3.s[3] + st1 {v9.2d},[x7],#16 + umlal v10.2d,v28.2s,v0.s[0] + ld1 {v9.2d},[x6] + umlal v11.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] + umlal v12.2d,v28.2s,v0.s[2] + b.eq .LInner_jump4 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump4: + umlal v13.2d,v28.2s,v0.s[3] + umlal v6.2d,v28.2s,v1.s[0] + umlal v7.2d,v28.2s,v1.s[1] + umlal v8.2d,v28.2s,v1.s[2] + umlal v9.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] + umlal v10.2d,v29.2s,v2.s[0] + umlal v11.2d,v29.2s,v2.s[1] + umlal v12.2d,v29.2s,v2.s[2] + umlal v13.2d,v29.2s,v2.s[3] + umlal v6.2d,v29.2s,v3.s[0] + umlal v7.2d,v29.2s,v3.s[1] + umlal v8.2d,v29.2s,v3.s[2] + umlal v9.2d,v29.2s,v3.s[3] + st1 {v10.2d},[x7],#16 + umlal v11.2d,v28.2s,v0.s[0] + ld1 {v10.2d},[x6] + umlal v12.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] + umlal v13.2d,v28.2s,v0.s[2] + b.eq .LInner_jump5 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump5: + umlal v6.2d,v28.2s,v0.s[3] + umlal v7.2d,v28.2s,v1.s[0] + umlal v8.2d,v28.2s,v1.s[1] + umlal v9.2d,v28.2s,v1.s[2] + umlal v10.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] + umlal v11.2d,v29.2s,v2.s[0] + umlal v12.2d,v29.2s,v2.s[1] + umlal v13.2d,v29.2s,v2.s[2] + umlal v6.2d,v29.2s,v2.s[3] + umlal v7.2d,v29.2s,v3.s[0] + umlal v8.2d,v29.2s,v3.s[1] + umlal v9.2d,v29.2s,v3.s[2] + umlal v10.2d,v29.2s,v3.s[3] + st1 {v11.2d},[x7],#16 + umlal v12.2d,v28.2s,v0.s[0] + ld1 {v11.2d},[x6] + umlal v13.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] + umlal v6.2d,v28.2s,v0.s[2] + b.eq .LInner_jump6 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump6: + umlal v7.2d,v28.2s,v0.s[3] + umlal v8.2d,v28.2s,v1.s[0] + umlal v9.2d,v28.2s,v1.s[1] + umlal v10.2d,v28.2s,v1.s[2] + umlal v11.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7] + umlal v12.2d,v29.2s,v2.s[0] + umlal v13.2d,v29.2s,v2.s[1] + umlal v6.2d,v29.2s,v2.s[2] + umlal v7.2d,v29.2s,v2.s[3] + umlal v8.2d,v29.2s,v3.s[0] + umlal v9.2d,v29.2s,v3.s[1] + umlal v10.2d,v29.2s,v3.s[2] + umlal v11.2d,v29.2s,v3.s[3] + st1 {v12.2d},[x7],#16 + umlal v13.2d,v28.2s,v0.s[0] + ld1 {v12.2d},[x6] + umlal v6.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] + umlal v7.2d,v28.2s,v0.s[2] + b.eq .LInner_jump7 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump7: + umlal v8.2d,v28.2s,v0.s[3] + umlal v9.2d,v28.2s,v1.s[0] + umlal v10.2d,v28.2s,v1.s[1] + umlal v11.2d,v28.2s,v1.s[2] + umlal v12.2d,v28.2s,v1.s[3] + b.ne .LInner_after_rewind8 + sub x1,x1,x5,lsl#2 // rewind +.LInner_after_rewind8: + umlal v13.2d,v29.2s,v2.s[0] + ld1 {v28.2s},[sp] // pull smashed b[8*i+0] + umlal v6.2d,v29.2s,v2.s[1] + ld1 {v0.4s,v1.4s},[x1],#32 + umlal v7.2d,v29.2s,v2.s[2] + add x10,sp,#8 // rewind + umlal 
v8.2d,v29.2s,v2.s[3] + umlal v9.2d,v29.2s,v3.s[0] + umlal v10.2d,v29.2s,v3.s[1] + umlal v11.2d,v29.2s,v3.s[2] + st1 {v13.2d},[x7],#16 + umlal v12.2d,v29.2s,v3.s[3] + + bne .LNEON_8n_inner + add x6,sp,#128 + st1 {v6.2d,v7.2d},[x7],#32 + eor v2.16b,v2.16b,v2.16b // v2 + st1 {v8.2d,v9.2d},[x7],#32 + eor v3.16b,v3.16b,v3.16b // v3 + st1 {v10.2d,v11.2d},[x7],#32 + st1 {v12.2d},[x7] + + subs x9,x9,#8 + ld1 {v6.2d,v7.2d},[x6],#32 + ld1 {v8.2d,v9.2d},[x6],#32 + ld1 {v10.2d,v11.2d},[x6],#32 + ld1 {v12.2d,v13.2d},[x6],#32 + + b.eq .LInner_8n_jump_2steps + sub x3,x3,x5,lsl#2 // rewind + b .LNEON_8n_outer + +.LInner_8n_jump_2steps: + add x7,sp,#128 + st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame + mov v5.16b,v6.16b + ushr v15.2d,v6.2d,#16 + ext v6.16b,v6.16b,v6.16b,#8 + st1 {v2.2d,v3.2d}, [sp],#32 + add v6.2d,v6.2d,v15.2d + st1 {v2.2d,v3.2d}, [sp],#32 + ushr v15.2d,v6.2d,#16 + st1 {v2.2d,v3.2d}, [sp],#32 + zip1 v6.4h,v5.4h,v6.4h + ins v15.d[1],v14.d[0] + + mov x8,x5 + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + add v6.2d,v6.2d,v15.2d + mov v5.16b,v6.16b + ushr v15.2d,v6.2d,#16 + ext v6.16b,v6.16b,v6.16b,#8 + ld1 {v8.2d,v9.2d}, [x6],#32 + add v6.2d,v6.2d,v15.2d + ld1 {v10.2d,v11.2d}, [x6],#32 + ushr v15.2d,v6.2d,#16 + ld1 {v12.2d,v13.2d}, [x6],#32 + zip1 v6.4h,v5.4h,v6.4h + ins v15.d[1],v14.d[0] + +.LNEON_tail_entry: + add v7.2d,v7.2d,v15.2d + st1 {v6.s}[0], [x7],#4 + ushr v15.2d,v7.2d,#16 + mov v5.16b,v7.16b + ext v7.16b,v7.16b,v7.16b,#8 + add v7.2d,v7.2d,v15.2d + ushr v15.2d,v7.2d,#16 + zip1 v7.4h,v5.4h,v7.4h + ins v15.d[1],v14.d[0] + add v8.2d,v8.2d,v15.2d + st1 {v7.s}[0], [x7],#4 + ushr v15.2d,v8.2d,#16 + mov v5.16b,v8.16b + ext v8.16b,v8.16b,v8.16b,#8 + add v8.2d,v8.2d,v15.2d + ushr v15.2d,v8.2d,#16 + zip1 v8.4h,v5.4h,v8.4h + ins v15.d[1],v14.d[0] + add v9.2d,v9.2d,v15.2d + st1 {v8.s}[0], [x7],#4 + ushr v15.2d,v9.2d,#16 + mov v5.16b,v9.16b + ext v9.16b,v9.16b,v9.16b,#8 + add v9.2d,v9.2d,v15.2d + ushr v15.2d,v9.2d,#16 + zip1 v9.4h,v5.4h,v9.4h + ins v15.d[1],v14.d[0] + add v10.2d,v10.2d,v15.2d + st1 {v9.s}[0], [x7],#4 + ushr v15.2d,v10.2d,#16 + mov v5.16b,v10.16b + ext v10.16b,v10.16b,v10.16b,#8 + add v10.2d,v10.2d,v15.2d + ushr v15.2d,v10.2d,#16 + zip1 v10.4h,v5.4h,v10.4h + ins v15.d[1],v14.d[0] + add v11.2d,v11.2d,v15.2d + st1 {v10.s}[0], [x7],#4 + ushr v15.2d,v11.2d,#16 + mov v5.16b,v11.16b + ext v11.16b,v11.16b,v11.16b,#8 + add v11.2d,v11.2d,v15.2d + ushr v15.2d,v11.2d,#16 + zip1 v11.4h,v5.4h,v11.4h + ins v15.d[1],v14.d[0] + add v12.2d,v12.2d,v15.2d + st1 {v11.s}[0], [x7],#4 + ushr v15.2d,v12.2d,#16 + mov v5.16b,v12.16b + ext v12.16b,v12.16b,v12.16b,#8 + add v12.2d,v12.2d,v15.2d + ushr v15.2d,v12.2d,#16 + zip1 v12.4h,v5.4h,v12.4h + ins v15.d[1],v14.d[0] + add v13.2d,v13.2d,v15.2d + st1 {v12.s}[0], [x7],#4 + ushr v15.2d,v13.2d,#16 + mov v5.16b,v13.16b + ext v13.16b,v13.16b,v13.16b,#8 + add v13.2d,v13.2d,v15.2d + ushr v15.2d,v13.2d,#16 + zip1 v13.4h,v5.4h,v13.4h + ins v15.d[1],v14.d[0] + ld1 {v6.2d,v7.2d}, [x6],#32 + subs x8,x8,#8 + st1 {v13.s}[0], [x7],#4 + bne .LNEON_tail + + st1 {v15.s}[0], [x7],#4 // top-most bit + sub x3,x3,x5,lsl#2 // rewind x3 + subs x1,sp,#0 // clear carry flag + add x2,sp,x5,lsl#2 + +.LNEON_sub: + ldp w4,w5,[x1],#8 + ldp w6,w7,[x1],#8 + ldp w8,w9,[x3],#8 + ldp w10,w11,[x3],#8 + sbcs w8,w4,w8 + sbcs w9,w5,w9 + sbcs w10,w6,w10 + sbcs w11,w7,w11 + sub x17,x2,x1 + stp w8,w9,[x0],#8 + stp w10,w11,[x0],#8 + cbnz x17,.LNEON_sub + + ldr w10, [x1] // load top-most bit + mov x11,sp + eor v0.16b,v0.16b,v0.16b + sub x11,x2,x11 // this is num*4 + eor v1.16b,v1.16b,v1.16b 
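+	// (the borrow accumulated in .LNEON_sub, together with the top-most
+	//  bit loaded above, decides in .LNEON_copy_n_zap whether the reduced
+	//  result or the original value is written back while the scratch
+	//  frame is wiped)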
+ mov x1,sp + sub x0,x0,x11 // rewind x0 + mov x3,x2 // second 3/4th of frame + sbcs w10,w10,wzr // result is carry flag + +.LNEON_copy_n_zap: + ldp w4,w5,[x1],#8 + ldp w6,w7,[x1],#8 + ldp w8,w9,[x0],#8 + ldp w10,w11,[x0] + sub x0,x0,#8 + b.cs .LCopy_1 + mov w8,w4 + mov w9,w5 + mov w10,w6 + mov w11,w7 +.LCopy_1: + st1 {v0.2d,v1.2d}, [x3],#32 // wipe + st1 {v0.2d,v1.2d}, [x3],#32 // wipe + ldp w4,w5,[x1],#8 + ldp w6,w7,[x1],#8 + stp w8,w9,[x0],#8 + stp w10,w11,[x0],#8 + sub x1,x1,#32 + ldp w8,w9,[x0],#8 + ldp w10,w11,[x0] + sub x0,x0,#8 + b.cs .LCopy_2 + mov w8, w4 + mov w9, w5 + mov w10, w6 + mov w11, w7 +.LCopy_2: + st1 {v0.2d,v1.2d}, [x1],#32 // wipe + st1 {v0.2d,v1.2d}, [x3],#32 // wipe + sub x17,x2,x1 // preserves carry + stp w8,w9,[x0],#8 + stp w10,w11,[x0],#8 + cbnz x17,.LNEON_copy_n_zap + + mov sp,x16 + ldp d14,d15,[sp,#64] + ldp d12,d13,[sp,#48] + ldp d10,d11,[sp,#32] + ldp d8,d9,[sp,#16] + ldr x29,[sp],#80 + ret // bx lr + +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon .type __bn_sqr8x_mont,%function .align 5 __bn_sqr8x_mont: @@ -519,7 +1235,7 @@ ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 - ldr x4,[x0,#-8*8] + ldur x4,[x0,#-8*8] adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 @@ -714,7 +1430,7 @@ //adc x28,xzr,xzr // moved below cbz x27,.Lsqr8x8_post_condition - ldr x4,[x2,#-8*8] + ldur x4,[x2,#-8*8] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] @@ -772,7 +1488,7 @@ ldp x12,x13,[x2,#8*6] cbz x27,.Lsqr8x_tail_break - ldr x4,[x0,#-8*8] + ldur x4,[x0,#-8*8] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] diff --git a/sys/crypto/openssl/aarch64/chacha-armv8.S b/sys/crypto/openssl/aarch64/chacha-armv8.S --- a/sys/crypto/openssl/aarch64/chacha-armv8.S +++ b/sys/crypto/openssl/aarch64/chacha-armv8.S @@ -1,23 +1,20 @@ /* Do not modify. This file is auto-generated from chacha-armv8.pl. */ -#include "arm_arch.h" - -.text - +#ifndef __KERNEL__ +# include "arm_arch.h" .hidden OPENSSL_armcap_P +#endif + +.text .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: -.long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. 
-#endif -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.long 1,2,3,4 +.Lrot24: +.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 .globl ChaCha20_ctr32 @@ -25,17 +22,15 @@ .align 5 ChaCha20_ctr32: cbz x2,.Labort - adr x5,.LOPENSSL_armcap_P cmp x2,#192 b.lo .Lshort -#ifdef __ILP32__ - ldrsw x6,[x5] -#else - ldr x6,[x5] -#endif - ldr w17,[x6,x5] + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON - b.ne ChaCha20_neon + b.ne .LChaCha20_neon +#endif .Lshort: .inst 0xd503233f // paciasp @@ -54,7 +49,7 @@ ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ldp x28,x30,[x4] // load counter -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 @@ -215,7 +210,7 @@ add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -272,7 +267,7 @@ add x15,x15,x16,lsl#32 add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -311,9 +306,13 @@ ret .size ChaCha20_ctr32,.-ChaCha20_ctr32 +#ifdef __KERNEL__ +.globl ChaCha20_neon +#endif .type ChaCha20_neon,%function .align 5 ChaCha20_neon: +.LChaCha20_neon: .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -330,15 +329,16 @@ sub sp,sp,#64 ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 + ld1 {v0.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] + ld1 {v1.4s,v2.4s},[x3] ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __ARMEB__ - rev64 v24.4s,v24.4s + ld1 {v3.4s},[x4] + stp d8,d9,[sp] // meet ABI requirements + ld1 {v8.4s,v9.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v0.4s,v0.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 @@ -346,296 +346,330 @@ ror x28,x28,#32 ror x30,x30,#32 #endif - add v27.4s,v27.4s,v31.4s // += 1 - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 .Loop_outer_neon: - mov w5,w22 // unpack key block + dup v16.4s,v0.s[0] // unpack key block + mov w5,w22 + dup v20.4s,v0.s[1] lsr x6,x22,#32 - mov v0.16b,v24.16b + dup v24.4s,v0.s[2] mov w7,w23 + dup v28.4s,v0.s[3] lsr x8,x23,#32 - mov v4.16b,v24.16b + dup v17.4s,v1.s[0] mov w9,w24 + dup v21.4s,v1.s[1] lsr x10,x24,#32 - mov v16.16b,v24.16b + dup v25.4s,v1.s[2] mov w11,w25 - mov v1.16b,v25.16b + dup v29.4s,v1.s[3] lsr x12,x25,#32 - mov v5.16b,v25.16b + dup v19.4s,v3.s[0] mov w13,w26 - mov v17.16b,v25.16b + dup v23.4s,v3.s[1] lsr x14,x26,#32 - mov v3.16b,v27.16b + dup v27.4s,v3.s[2] mov w15,w27 - mov v7.16b,v28.16b + dup v31.4s,v3.s[3] lsr x16,x27,#32 - mov v19.16b,v29.16b + add v19.4s,v19.4s,v8.4s mov w17,w28 - mov v2.16b,v26.16b + dup v18.4s,v2.s[0] lsr x19,x28,#32 - mov v6.16b,v26.16b + dup v22.4s,v2.s[1] mov w20,w30 - mov v18.16b,v26.16b + dup v26.4s,v2.s[2] lsr x21,x30,#32 + dup v30.4s,v2.s[3] mov x4,#10 - subs x2,x2,#256 + subs x2,x2,#320 .Loop_neon: sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s + add v16.4s,v16.4s,v17.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v20.4s,v20.4s,v21.4s add w6,w6,w10 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s add w7,w7,w11 - eor v3.16b,v3.16b,v0.16b + add v28.4s,v28.4s,v29.4s add w8,w8,w12 - eor v7.16b,v7.16b,v4.16b - 
eor w17,w17,w5 eor v19.16b,v19.16b,v16.16b + eor w17,w17,w5 + eor v23.16b,v23.16b,v20.16b eor w19,w19,w6 - rev32 v3.8h,v3.8h + eor v27.16b,v27.16b,v24.16b eor w20,w20,w7 - rev32 v7.8h,v7.8h + eor v31.16b,v31.16b,v28.16b eor w21,w21,w8 rev32 v19.8h,v19.8h ror w17,w17,#16 - add v2.4s,v2.4s,v3.4s + rev32 v23.8h,v23.8h ror w19,w19,#16 - add v6.4s,v6.4s,v7.4s + rev32 v27.8h,v27.8h ror w20,w20,#16 - add v18.4s,v18.4s,v19.4s + rev32 v31.8h,v31.8h ror w21,w21,#16 - eor v20.16b,v1.16b,v2.16b + add v18.4s,v18.4s,v19.4s add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b + add v22.4s,v22.4s,v23.4s add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b + add v26.4s,v26.4s,v27.4s add w15,w15,w20 - ushr v1.4s,v20.4s,#20 + add v30.4s,v30.4s,v31.4s add w16,w16,w21 - ushr v5.4s,v21.4s,#20 + eor v4.16b,v17.16b,v18.16b eor w9,w9,w13 - ushr v17.4s,v22.4s,#20 + eor v5.16b,v21.16b,v22.16b eor w10,w10,w14 - sli v1.4s,v20.4s,#12 + eor v6.16b,v25.16b,v26.16b eor w11,w11,w15 - sli v5.4s,v21.4s,#12 + eor v7.16b,v29.16b,v30.16b eor w12,w12,w16 - sli v17.4s,v22.4s,#12 + ushr v17.4s,v4.4s,#20 ror w9,w9,#20 - add v0.4s,v0.4s,v1.4s + ushr v21.4s,v5.4s,#20 ror w10,w10,#20 - add v4.4s,v4.4s,v5.4s + ushr v25.4s,v6.4s,#20 ror w11,w11,#20 - add v16.4s,v16.4s,v17.4s + ushr v29.4s,v7.4s,#20 ror w12,w12,#20 - eor v20.16b,v3.16b,v0.16b + sli v17.4s,v4.4s,#12 add w5,w5,w9 - eor v21.16b,v7.16b,v4.16b + sli v21.4s,v5.4s,#12 add w6,w6,w10 - eor v22.16b,v19.16b,v16.16b + sli v25.4s,v6.4s,#12 add w7,w7,w11 - ushr v3.4s,v20.4s,#24 + sli v29.4s,v7.4s,#12 add w8,w8,w12 - ushr v7.4s,v21.4s,#24 + add v16.4s,v16.4s,v17.4s eor w17,w17,w5 - ushr v19.4s,v22.4s,#24 + add v20.4s,v20.4s,v21.4s eor w19,w19,w6 - sli v3.4s,v20.4s,#8 + add v24.4s,v24.4s,v25.4s eor w20,w20,w7 - sli v7.4s,v21.4s,#8 + add v28.4s,v28.4s,v29.4s eor w21,w21,w8 - sli v19.4s,v22.4s,#8 + eor v4.16b,v19.16b,v16.16b ror w17,w17,#24 - add v2.4s,v2.4s,v3.4s + eor v5.16b,v23.16b,v20.16b ror w19,w19,#24 - add v6.4s,v6.4s,v7.4s + eor v6.16b,v27.16b,v24.16b ror w20,w20,#24 - add v18.4s,v18.4s,v19.4s + eor v7.16b,v31.16b,v28.16b ror w21,w21,#24 - eor v20.16b,v1.16b,v2.16b + tbl v19.16b,{v4.16b},v9.16b add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b + tbl v23.16b,{v5.16b},v9.16b add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b + tbl v27.16b,{v6.16b},v9.16b add w15,w15,w20 - ushr v1.4s,v20.4s,#25 + tbl v31.16b,{v7.16b},v9.16b add w16,w16,w21 - ushr v5.4s,v21.4s,#25 + add v18.4s,v18.4s,v19.4s eor w9,w9,w13 - ushr v17.4s,v22.4s,#25 + add v22.4s,v22.4s,v23.4s eor w10,w10,w14 - sli v1.4s,v20.4s,#7 + add v26.4s,v26.4s,v27.4s eor w11,w11,w15 - sli v5.4s,v21.4s,#7 + add v30.4s,v30.4s,v31.4s eor w12,w12,w16 - sli v17.4s,v22.4s,#7 + eor v4.16b,v17.16b,v18.16b ror w9,w9,#25 - ext v2.16b,v2.16b,v2.16b,#8 + eor v5.16b,v21.16b,v22.16b ror w10,w10,#25 - ext v6.16b,v6.16b,v6.16b,#8 + eor v6.16b,v25.16b,v26.16b ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 + eor v7.16b,v29.16b,v30.16b ror w12,w12,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - add v0.4s,v0.4s,v1.4s + ushr v17.4s,v4.4s,#25 + ushr v21.4s,v5.4s,#25 + ushr v25.4s,v6.4s,#25 + ushr v29.4s,v7.4s,#25 + sli v17.4s,v4.4s,#7 + sli v21.4s,v5.4s,#7 + sli v25.4s,v6.4s,#7 + sli v29.4s,v7.4s,#7 + add v16.4s,v16.4s,v21.4s add w5,w5,w10 - add v4.4s,v4.4s,v5.4s + add v20.4s,v20.4s,v25.4s add w6,w6,w11 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v29.4s add w7,w7,w12 - eor v3.16b,v3.16b,v0.16b + add v28.4s,v28.4s,v17.4s add 
w8,w8,w9 - eor v7.16b,v7.16b,v4.16b + eor v31.16b,v31.16b,v16.16b eor w21,w21,w5 - eor v19.16b,v19.16b,v16.16b + eor v19.16b,v19.16b,v20.16b eor w17,w17,w6 - rev32 v3.8h,v3.8h + eor v23.16b,v23.16b,v24.16b eor w19,w19,w7 - rev32 v7.8h,v7.8h + eor v27.16b,v27.16b,v28.16b eor w20,w20,w8 - rev32 v19.8h,v19.8h + rev32 v31.8h,v31.8h ror w21,w21,#16 - add v2.4s,v2.4s,v3.4s + rev32 v19.8h,v19.8h ror w17,w17,#16 - add v6.4s,v6.4s,v7.4s + rev32 v23.8h,v23.8h ror w19,w19,#16 - add v18.4s,v18.4s,v19.4s + rev32 v27.8h,v27.8h ror w20,w20,#16 - eor v20.16b,v1.16b,v2.16b + add v26.4s,v26.4s,v31.4s add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b + add v30.4s,v30.4s,v19.4s add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b + add v18.4s,v18.4s,v23.4s add w13,w13,w19 - ushr v1.4s,v20.4s,#20 + add v22.4s,v22.4s,v27.4s add w14,w14,w20 - ushr v5.4s,v21.4s,#20 + eor v4.16b,v21.16b,v26.16b eor w10,w10,w15 - ushr v17.4s,v22.4s,#20 + eor v5.16b,v25.16b,v30.16b eor w11,w11,w16 - sli v1.4s,v20.4s,#12 + eor v6.16b,v29.16b,v18.16b eor w12,w12,w13 - sli v5.4s,v21.4s,#12 + eor v7.16b,v17.16b,v22.16b eor w9,w9,w14 - sli v17.4s,v22.4s,#12 + ushr v21.4s,v4.4s,#20 ror w10,w10,#20 - add v0.4s,v0.4s,v1.4s + ushr v25.4s,v5.4s,#20 ror w11,w11,#20 - add v4.4s,v4.4s,v5.4s + ushr v29.4s,v6.4s,#20 ror w12,w12,#20 - add v16.4s,v16.4s,v17.4s + ushr v17.4s,v7.4s,#20 ror w9,w9,#20 - eor v20.16b,v3.16b,v0.16b + sli v21.4s,v4.4s,#12 add w5,w5,w10 - eor v21.16b,v7.16b,v4.16b + sli v25.4s,v5.4s,#12 add w6,w6,w11 - eor v22.16b,v19.16b,v16.16b + sli v29.4s,v6.4s,#12 add w7,w7,w12 - ushr v3.4s,v20.4s,#24 + sli v17.4s,v7.4s,#12 add w8,w8,w9 - ushr v7.4s,v21.4s,#24 + add v16.4s,v16.4s,v21.4s eor w21,w21,w5 - ushr v19.4s,v22.4s,#24 + add v20.4s,v20.4s,v25.4s eor w17,w17,w6 - sli v3.4s,v20.4s,#8 + add v24.4s,v24.4s,v29.4s eor w19,w19,w7 - sli v7.4s,v21.4s,#8 + add v28.4s,v28.4s,v17.4s eor w20,w20,w8 - sli v19.4s,v22.4s,#8 + eor v4.16b,v31.16b,v16.16b ror w21,w21,#24 - add v2.4s,v2.4s,v3.4s + eor v5.16b,v19.16b,v20.16b ror w17,w17,#24 - add v6.4s,v6.4s,v7.4s + eor v6.16b,v23.16b,v24.16b ror w19,w19,#24 - add v18.4s,v18.4s,v19.4s + eor v7.16b,v27.16b,v28.16b ror w20,w20,#24 - eor v20.16b,v1.16b,v2.16b + tbl v31.16b,{v4.16b},v9.16b add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b + tbl v19.16b,{v5.16b},v9.16b add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b + tbl v23.16b,{v6.16b},v9.16b add w13,w13,w19 - ushr v1.4s,v20.4s,#25 + tbl v27.16b,{v7.16b},v9.16b add w14,w14,w20 - ushr v5.4s,v21.4s,#25 + add v26.4s,v26.4s,v31.4s eor w10,w10,w15 - ushr v17.4s,v22.4s,#25 + add v30.4s,v30.4s,v19.4s eor w11,w11,w16 - sli v1.4s,v20.4s,#7 + add v18.4s,v18.4s,v23.4s eor w12,w12,w13 - sli v5.4s,v21.4s,#7 + add v22.4s,v22.4s,v27.4s eor w9,w9,w14 - sli v17.4s,v22.4s,#7 + eor v4.16b,v21.16b,v26.16b ror w10,w10,#25 - ext v2.16b,v2.16b,v2.16b,#8 + eor v5.16b,v25.16b,v30.16b ror w11,w11,#25 - ext v6.16b,v6.16b,v6.16b,#8 + eor v6.16b,v29.16b,v18.16b ror w12,w12,#25 - ext v18.16b,v18.16b,v18.16b,#8 + eor v7.16b,v17.16b,v22.16b ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 + ushr v21.4s,v4.4s,#25 + ushr v25.4s,v5.4s,#25 + ushr v29.4s,v6.4s,#25 + ushr v17.4s,v7.4s,#25 + sli v21.4s,v4.4s,#7 + sli v25.4s,v5.4s,#7 + sli v29.4s,v6.4s,#7 + sli v17.4s,v7.4s,#7 cbnz x4,.Loop_neon + add v19.4s,v19.4s,v8.4s + + zip1 v4.4s,v16.4s,v20.4s // transpose data + zip1 v5.4s,v24.4s,v28.4s + zip2 v6.4s,v16.4s,v20.4s + zip2 v7.4s,v24.4s,v28.4s + zip1 
v16.2d,v4.2d,v5.2d + zip2 v20.2d,v4.2d,v5.2d + zip1 v24.2d,v6.2d,v7.2d + zip2 v28.2d,v6.2d,v7.2d + + zip1 v4.4s,v17.4s,v21.4s + zip1 v5.4s,v25.4s,v29.4s + zip2 v6.4s,v17.4s,v21.4s + zip2 v7.4s,v25.4s,v29.4s + zip1 v17.2d,v4.2d,v5.2d + zip2 v21.2d,v4.2d,v5.2d + zip1 v25.2d,v6.2d,v7.2d + zip2 v29.2d,v6.2d,v7.2d + + zip1 v4.4s,v18.4s,v22.4s add w5,w5,w22 // accumulate key block - add v0.4s,v0.4s,v24.4s + zip1 v5.4s,v26.4s,v30.4s add x6,x6,x22,lsr#32 - add v4.4s,v4.4s,v24.4s + zip2 v6.4s,v18.4s,v22.4s add w7,w7,w23 - add v16.4s,v16.4s,v24.4s + zip2 v7.4s,v26.4s,v30.4s add x8,x8,x23,lsr#32 - add v2.4s,v2.4s,v26.4s + zip1 v18.2d,v4.2d,v5.2d add w9,w9,w24 - add v6.4s,v6.4s,v26.4s + zip2 v22.2d,v4.2d,v5.2d add x10,x10,x24,lsr#32 - add v18.4s,v18.4s,v26.4s + zip1 v26.2d,v6.2d,v7.2d add w11,w11,w25 - add v3.4s,v3.4s,v27.4s + zip2 v30.2d,v6.2d,v7.2d add x12,x12,x25,lsr#32 + + zip1 v4.4s,v19.4s,v23.4s add w13,w13,w26 - add v7.4s,v7.4s,v28.4s + zip1 v5.4s,v27.4s,v31.4s add x14,x14,x26,lsr#32 + zip2 v6.4s,v19.4s,v23.4s add w15,w15,w27 - add v19.4s,v19.4s,v29.4s + zip2 v7.4s,v27.4s,v31.4s add x16,x16,x27,lsr#32 + zip1 v19.2d,v4.2d,v5.2d add w17,w17,w28 - add v1.4s,v1.4s,v25.4s + zip2 v23.2d,v4.2d,v5.2d add x19,x19,x28,lsr#32 + zip1 v27.2d,v6.2d,v7.2d add w20,w20,w30 - add v5.4s,v5.4s,v25.4s + zip2 v31.2d,v6.2d,v7.2d add x21,x21,x30,lsr#32 - add v17.4s,v17.4s,v25.4s b.lo .Ltail_neon add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input + add v16.4s,v16.4s,v0.4s // accumulate key block add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] + add v17.4s,v17.4s,v1.4s add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] + add v18.4s,v18.4s,v2.4s add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] + add v19.4s,v19.4s,v3.4s add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -645,48 +679,68 @@ rev x17,x17 rev x20,x20 #endif - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 eor x5,x5,x6 + add v20.4s,v20.4s,v0.4s eor x7,x7,x8 + add v21.4s,v21.4s,v1.4s eor x9,x9,x10 + add v22.4s,v22.4s,v2.4s eor x11,x11,x12 + add v23.4s,v23.4s,v3.4s eor x13,x13,x14 - eor v0.16b,v0.16b,v20.16b + eor v16.16b,v16.16b,v4.16b + movi v4.4s,#5 eor x15,x15,x16 - eor v1.16b,v1.16b,v21.16b + eor v17.16b,v17.16b,v5.16b eor x17,x17,x19 - eor v2.16b,v2.16b,v22.16b + eor v18.16b,v18.16b,v6.16b eor x20,x20,x21 - eor v3.16b,v3.16b,v23.16b - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v19.16b,v19.16b,v7.16b + add v8.4s,v8.4s,v4.4s // += 5 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter + add x28,x28,#5 // increment counter stp x9,x11,[x0,#16] - add v27.4s,v27.4s,v31.4s // += 4 stp x13,x15,[x0,#32] - add v28.4s,v28.4s,v31.4s stp x17,x20,[x0,#48] - add v29.4s,v29.4s,v31.4s add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - eor v16.16b,v16.16b,v0.16b - eor v17.16b,v17.16b,v1.16b - eor v18.16b,v18.16b,v2.16b - eor v19.16b,v19.16b,v3.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + add v24.4s,v24.4s,v0.4s + add v25.4s,v25.4s,v1.4s + add v26.4s,v26.4s,v2.4s + add v27.4s,v27.4s,v3.4s + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 + + eor v20.16b,v20.16b,v4.16b + eor v21.16b,v21.16b,v5.16b + eor v22.16b,v22.16b,v6.16b + 
eor v23.16b,v23.16b,v7.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v1.4s + add v30.4s,v30.4s,v2.4s + add v31.4s,v31.4s,v3.4s + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + eor v24.16b,v24.16b,v16.16b + eor v25.16b,v25.16b,v17.16b + eor v26.16b,v26.16b,v18.16b + eor v27.16b,v27.16b,v19.16b + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + + eor v28.16b,v28.16b,v20.16b + eor v29.16b,v29.16b,v21.16b + eor v30.16b,v30.16b,v22.16b + eor v31.16b,v31.16b,v23.16b + st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 b.hi .Loop_outer_neon + ldp d8,d9,[sp] // meet ABI requirements + ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] @@ -697,8 +751,10 @@ .inst 0xd50323bf // autiasp ret +.align 4 .Ltail_neon: - add x2,x2,#256 + add x2,x2,#320 + ldp d8,d9,[sp] // meet ABI requirements cmp x2,#64 b.lo .Less_than_64 @@ -715,7 +771,7 @@ add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -735,48 +791,68 @@ eor x20,x20,x21 stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter + add v16.4s,v16.4s,v0.4s // accumulate key block stp x9,x11,[x0,#16] + add v17.4s,v17.4s,v1.4s stp x13,x15,[x0,#32] + add v18.4s,v18.4s,v2.4s stp x17,x20,[x0,#48] + add v19.4s,v19.4s,v3.4s add x0,x0,#64 b.eq .Ldone_neon sub x2,x2,#64 cmp x2,#64 - b.lo .Less_than_128 + b.lo .Last_neon - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v0.16b,v0.16b,v20.16b - eor v1.16b,v1.16b,v21.16b - eor v2.16b,v2.16b,v22.16b - eor v3.16b,v3.16b,v23.16b - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v16.16b,v16.16b,v4.16b + eor v17.16b,v17.16b,v5.16b + eor v18.16b,v18.16b,v6.16b + eor v19.16b,v19.16b,v7.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 b.eq .Ldone_neon + + add v16.4s,v20.4s,v0.4s + add v17.4s,v21.4s,v1.4s sub x2,x2,#64 + add v18.4s,v22.4s,v2.4s cmp x2,#64 - b.lo .Less_than_192 + add v19.4s,v23.4s,v3.4s + b.lo .Last_neon - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v20.16b,v16.16b,v4.16b + eor v21.16b,v17.16b,v5.16b + eor v22.16b,v18.16b,v6.16b + eor v23.16b,v19.16b,v7.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 b.eq .Ldone_neon + + add v16.4s,v24.4s,v0.4s + add v17.4s,v25.4s,v1.4s sub x2,x2,#64 + add v18.4s,v26.4s,v2.4s + cmp x2,#64 + add v19.4s,v27.4s,v3.4s + b.lo .Last_neon - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b .Last_neon + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v24.16b,v16.16b,v4.16b + eor v25.16b,v17.16b,v5.16b + eor v26.16b,v18.16b,v6.16b + eor v27.16b,v19.16b,v7.16b + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + b.eq .Ldone_neon -.Less_than_128: - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b .Last_neon -.Less_than_192: - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b .Last_neon + add v16.4s,v28.4s,v0.4s + add v17.4s,v29.4s,v1.4s + add v18.4s,v30.4s,v2.4s + add v19.4s,v31.4s,v3.4s + sub x2,x2,#64 -.align 4 .Last_neon: + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 @@ -824,16 +900,18 @@ .L512_or_more_neon: sub sp,sp,#128+64 + eor v7.16b,v7.16b,v7.16b ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 + ld1 {v0.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] + ld1 {v1.4s,v2.4s},[x3] ldp 
x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __ARMEB__ - rev64 v24.4s,v24.4s + ld1 {v3.4s},[x4] + ld1 {v7.s}[0],[x5] + add x3,x5,#16 // .Lrot24 +#ifdef __AARCH64EB__ + rev64 v0.4s,v0.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 @@ -841,14 +919,14 @@ ror x28,x28,#32 ror x30,x30,#32 #endif - add v27.4s,v27.4s,v31.4s // += 1 - stp q24,q25,[sp,#0] // off-load key block, invariant part - add v27.4s,v27.4s,v31.4s // not typo - str q26,[sp,#32] - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - add v30.4s,v29.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 + add v3.4s,v3.4s,v7.4s // += 1 + stp q0,q1,[sp,#0] // off-load key block, invariant part + add v3.4s,v3.4s,v7.4s // not typo + str q2,[sp,#32] + add v4.4s,v3.4s,v7.4s + add v5.4s,v4.4s,v7.4s + add v6.4s,v5.4s,v7.4s + shl v7.4s,v7.4s,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] @@ -858,461 +936,450 @@ sub x2,x2,#512 // not typo .Loop_outer_512_neon: - mov v0.16b,v24.16b - mov v4.16b,v24.16b - mov v8.16b,v24.16b - mov v12.16b,v24.16b - mov v16.16b,v24.16b - mov v20.16b,v24.16b - mov v1.16b,v25.16b + mov v8.16b,v0.16b + mov v12.16b,v0.16b + mov v16.16b,v0.16b + mov v20.16b,v0.16b + mov v24.16b,v0.16b + mov v28.16b,v0.16b + mov v9.16b,v1.16b mov w5,w22 // unpack key block - mov v5.16b,v25.16b + mov v13.16b,v1.16b lsr x6,x22,#32 - mov v9.16b,v25.16b + mov v17.16b,v1.16b mov w7,w23 - mov v13.16b,v25.16b + mov v21.16b,v1.16b lsr x8,x23,#32 - mov v17.16b,v25.16b + mov v25.16b,v1.16b mov w9,w24 - mov v21.16b,v25.16b + mov v29.16b,v1.16b lsr x10,x24,#32 - mov v3.16b,v27.16b + mov v11.16b,v3.16b mov w11,w25 - mov v7.16b,v28.16b + mov v15.16b,v4.16b lsr x12,x25,#32 - mov v11.16b,v29.16b + mov v19.16b,v5.16b mov w13,w26 - mov v15.16b,v30.16b + mov v23.16b,v6.16b lsr x14,x26,#32 - mov v2.16b,v26.16b + mov v10.16b,v2.16b mov w15,w27 - mov v6.16b,v26.16b + mov v14.16b,v2.16b lsr x16,x27,#32 - add v19.4s,v3.4s,v31.4s // +4 + add v27.4s,v11.4s,v7.4s // +4 mov w17,w28 - add v23.4s,v7.4s,v31.4s // +4 + add v31.4s,v15.4s,v7.4s // +4 lsr x19,x28,#32 - mov v10.16b,v26.16b + mov v18.16b,v2.16b mov w20,w30 - mov v14.16b,v26.16b + mov v22.16b,v2.16b lsr x21,x30,#32 - mov v18.16b,v26.16b - stp q27,q28,[sp,#48] // off-load key block, variable part - mov v22.16b,v26.16b - str q29,[sp,#80] + mov v26.16b,v2.16b + stp q3,q4,[sp,#48] // off-load key block, variable part + mov v30.16b,v2.16b + stp q5,q6,[sp,#80] mov x4,#5 + ld1 {v6.4s},[x3] subs x2,x2,#512 .Loop_upper_neon: sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 rev32 v11.8h,v11.8h - add w15,w15,w20 + add w13,w13,w17 rev32 v15.8h,v15.8h - add w16,w16,w21 + add w14,w14,w19 
rev32 v19.8h,v19.8h - eor w9,w9,w13 + add w15,w15,w20 rev32 v23.8h,v23.8h + add w16,w16,w21 + rev32 v27.8h,v27.8h + eor w9,w9,w13 + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 + eor w11,w11,w15 add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 + eor w12,w12,w16 add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 + ror w9,w9,#25 add v20.4s,v20.4s,v21.4s + ror w10,w10,#25 + add v24.4s,v24.4s,v25.4s + ror w11,w11,#25 + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 + add w15,w15,w21 add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 + add w16,w16,w17 add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 + add w13,w13,w19 add 
v22.4s,v22.4s,v23.4s + add w14,w14,w20 + add v26.4s,v26.4s,v27.4s + eor w10,w10,w15 + add v30.4s,v30.4s,v31.4s + eor w11,w11,w16 + eor v0.16b,v9.16b,v10.16b + eor w12,w12,w13 + eor v1.16b,v13.16b,v14.16b + eor w9,w9,w14 + eor v2.16b,v17.16b,v18.16b + ror w10,w10,#20 + eor v3.16b,v21.16b,v22.16b + ror w11,w11,#20 + eor v4.16b,v25.16b,v26.16b + ror w12,w12,#20 + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 + add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 + add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 + add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 + add w14,w14,w20 + ext v26.16b,v26.16b,v26.16b,#8 + eor w10,w10,w15 + ext v30.16b,v30.16b,v30.16b,#8 + eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#12 + eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#12 + eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#12 + ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 + ror w11,w11,#25 + ext v27.16b,v27.16b,v27.16b,#12 + ror w12,w12,#25 + ext v31.16b,v31.16b,v31.16b,#12 + ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s + ext v25.16b,v25.16b,v25.16b,#4 + ext v29.16b,v29.16b,v29.16b,#4 + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 rev32 v11.8h,v11.8h - add w15,w15,w20 + add w13,w13,w17 rev32 v15.8h,v15.8h - add w16,w16,w21 + add w14,w14,w19 rev32 v19.8h,v19.8h - eor 
w9,w9,w13 + add w15,w15,w20 rev32 v23.8h,v23.8h + add w16,w16,w21 + rev32 v27.8h,v27.8h + eor w9,w9,w13 + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 + eor w11,w11,w15 add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 + eor w12,w12,w16 add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 + ror w9,w9,#25 add v20.4s,v20.4s,v21.4s + ror w10,w10,#25 + add v24.4s,v24.4s,v25.4s + ror w11,w11,#25 + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 + add w15,w15,w21 add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 + add w16,w16,w17 add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 + add w13,w13,w19 add v22.4s,v22.4s,v23.4s + add w14,w14,w20 + 
add v26.4s,v26.4s,v27.4s + eor w10,w10,w15 + add v30.4s,v30.4s,v31.4s + eor w11,w11,w16 + eor v0.16b,v9.16b,v10.16b + eor w12,w12,w13 + eor v1.16b,v13.16b,v14.16b + eor w9,w9,w14 + eor v2.16b,v17.16b,v18.16b + ror w10,w10,#20 + eor v3.16b,v21.16b,v22.16b + ror w11,w11,#20 + eor v4.16b,v25.16b,v26.16b + ror w12,w12,#20 + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 + add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 + add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 + add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 + add w14,w14,w20 + ext v26.16b,v26.16b,v26.16b,#8 + eor w10,w10,w15 + ext v30.16b,v30.16b,v30.16b,#8 + eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#4 + eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#4 + eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#4 + ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 + ror w11,w11,#25 + ext v27.16b,v27.16b,v27.16b,#4 + ror w12,w12,#25 + ext v31.16b,v31.16b,v31.16b,#4 + ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 + ext v25.16b,v25.16b,v25.16b,#12 + ext v29.16b,v29.16b,v29.16b,#12 cbnz x4,.Loop_upper_neon add w5,w5,w22 // accumulate key block @@ -1345,7 +1412,7 @@ add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -1390,476 +1457,465 @@ mov x4,#5 .Loop_lower_neon: sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor 
v31.16b,v31.16b,v28.16b ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 rev32 v11.8h,v11.8h - add w15,w15,w20 + add w13,w13,w17 rev32 v15.8h,v15.8h - add w16,w16,w21 + add w14,w14,w19 rev32 v19.8h,v19.8h - eor w9,w9,w13 + add w15,w15,w20 rev32 v23.8h,v23.8h + add w16,w16,w21 + rev32 v27.8h,v27.8h + eor w9,w9,w13 + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 + eor w11,w11,w15 add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 + eor w12,w12,w16 add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 + ror w9,w9,#25 add v20.4s,v20.4s,v21.4s + ror w10,w10,#25 + add v24.4s,v24.4s,v25.4s + ror w11,w11,#25 + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add 
v6.4s,v6.4s,v7.4s - eor w9,w9,w14 add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 + add w15,w15,w21 add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 + add w16,w16,w17 add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 + add w13,w13,w19 add v22.4s,v22.4s,v23.4s + add w14,w14,w20 + add v26.4s,v26.4s,v27.4s + eor w10,w10,w15 + add v30.4s,v30.4s,v31.4s + eor w11,w11,w16 + eor v0.16b,v9.16b,v10.16b + eor w12,w12,w13 + eor v1.16b,v13.16b,v14.16b + eor w9,w9,w14 + eor v2.16b,v17.16b,v18.16b + ror w10,w10,#20 + eor v3.16b,v21.16b,v22.16b + ror w11,w11,#20 + eor v4.16b,v25.16b,v26.16b + ror w12,w12,#20 + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 + add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 + add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 + add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 + add w14,w14,w20 + ext v26.16b,v26.16b,v26.16b,#8 + eor w10,w10,w15 + ext v30.16b,v30.16b,v30.16b,#8 + eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#12 + eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#12 + eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#12 + ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 + ror w11,w11,#25 + ext v27.16b,v27.16b,v27.16b,#12 + ror w12,w12,#25 + ext v31.16b,v31.16b,v31.16b,#12 + ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s + ext v25.16b,v25.16b,v25.16b,#4 + ext v29.16b,v29.16b,v29.16b,#4 + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b ror 
w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 rev32 v11.8h,v11.8h - add w15,w15,w20 + add w13,w13,w17 rev32 v15.8h,v15.8h - add w16,w16,w21 + add w14,w14,w19 rev32 v19.8h,v19.8h - eor w9,w9,w13 + add w15,w15,w20 rev32 v23.8h,v23.8h + add w16,w16,w21 + rev32 v27.8h,v27.8h + eor w9,w9,w13 + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 + eor w11,w11,w15 add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 + eor w12,w12,w16 add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 + ror w9,w9,#25 add v20.4s,v20.4s,v21.4s + ror w10,w10,#25 + add v24.4s,v24.4s,v25.4s + ror w11,w11,#25 + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 
add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 + add w15,w15,w21 add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 + add w16,w16,w17 add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 + add w13,w13,w19 add v22.4s,v22.4s,v23.4s + add w14,w14,w20 + add v26.4s,v26.4s,v27.4s + eor w10,w10,w15 + add v30.4s,v30.4s,v31.4s + eor w11,w11,w16 + eor v0.16b,v9.16b,v10.16b + eor w12,w12,w13 + eor v1.16b,v13.16b,v14.16b + eor w9,w9,w14 + eor v2.16b,v17.16b,v18.16b + ror w10,w10,#20 + eor v3.16b,v21.16b,v22.16b + ror w11,w11,#20 + eor v4.16b,v25.16b,v26.16b + ror w12,w12,#20 + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 + add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 + add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 + add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 + add w14,w14,w20 + ext v26.16b,v26.16b,v26.16b,#8 + eor w10,w10,w15 + ext v30.16b,v30.16b,v30.16b,#8 + eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#4 + eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#4 + eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#4 + ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 + ror w11,w11,#25 + ext v27.16b,v27.16b,v27.16b,#4 + ror w12,w12,#25 + ext v31.16b,v31.16b,v31.16b,#4 + ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 + ext v25.16b,v25.16b,v25.16b,#12 + ext v29.16b,v29.16b,v29.16b,#12 cbnz x4,.Loop_lower_neon add w5,w5,w22 // accumulate key block - ldp q24,q25,[sp,#0] + ldp q0,q1,[sp,#0] add x6,x6,x22,lsr#32 - ldp q26,q27,[sp,#32] + ldp q2,q3,[sp,#32] add w7,w7,w23 - ldp q28,q29,[sp,#64] + ldp q4,q5,[sp,#64] add x8,x8,x23,lsr#32 - add v0.4s,v0.4s,v24.4s + ldr q6,[sp,#96] + add v8.4s,v8.4s,v0.4s add w9,w9,w24 - add v4.4s,v4.4s,v24.4s + add v12.4s,v12.4s,v0.4s add x10,x10,x24,lsr#32 - add v8.4s,v8.4s,v24.4s + add v16.4s,v16.4s,v0.4s add w11,w11,w25 - add v12.4s,v12.4s,v24.4s + add v20.4s,v20.4s,v0.4s add x12,x12,x25,lsr#32 - add v16.4s,v16.4s,v24.4s + add v24.4s,v24.4s,v0.4s add w13,w13,w26 - add v20.4s,v20.4s,v24.4s + add v28.4s,v28.4s,v0.4s add x14,x14,x26,lsr#32 - add v2.4s,v2.4s,v26.4s + add v10.4s,v10.4s,v2.4s add w15,w15,w27 - add v6.4s,v6.4s,v26.4s + add v14.4s,v14.4s,v2.4s add x16,x16,x27,lsr#32 - add v10.4s,v10.4s,v26.4s + add 
v18.4s,v18.4s,v2.4s add w17,w17,w28 - add v14.4s,v14.4s,v26.4s + add v22.4s,v22.4s,v2.4s add x19,x19,x28,lsr#32 - add v18.4s,v18.4s,v26.4s + add v26.4s,v26.4s,v2.4s add w20,w20,w30 - add v22.4s,v22.4s,v26.4s + add v30.4s,v30.4s,v2.4s add x21,x21,x30,lsr#32 - add v19.4s,v19.4s,v31.4s // +4 + add v27.4s,v27.4s,v7.4s // +4 add x5,x5,x6,lsl#32 // pack - add v23.4s,v23.4s,v31.4s // +4 + add v31.4s,v31.4s,v7.4s // +4 add x7,x7,x8,lsl#32 - add v3.4s,v3.4s,v27.4s + add v11.4s,v11.4s,v3.4s ldp x6,x8,[x1,#0] // load input - add v7.4s,v7.4s,v28.4s + add v15.4s,v15.4s,v4.4s add x9,x9,x10,lsl#32 - add v11.4s,v11.4s,v29.4s + add v19.4s,v19.4s,v5.4s add x11,x11,x12,lsl#32 - add v15.4s,v15.4s,v30.4s + add v23.4s,v23.4s,v6.4s ldp x10,x12,[x1,#16] - add v19.4s,v19.4s,v27.4s + add v27.4s,v27.4s,v3.4s add x13,x13,x14,lsl#32 - add v23.4s,v23.4s,v28.4s + add v31.4s,v31.4s,v4.4s add x15,x15,x16,lsl#32 - add v1.4s,v1.4s,v25.4s + add v9.4s,v9.4s,v1.4s ldp x14,x16,[x1,#32] - add v5.4s,v5.4s,v25.4s + add v13.4s,v13.4s,v1.4s add x17,x17,x19,lsl#32 - add v9.4s,v9.4s,v25.4s + add v17.4s,v17.4s,v1.4s add x20,x20,x21,lsl#32 - add v13.4s,v13.4s,v25.4s + add v21.4s,v21.4s,v1.4s ldp x19,x21,[x1,#48] - add v17.4s,v17.4s,v25.4s + add v25.4s,v25.4s,v1.4s add x1,x1,#64 - add v21.4s,v21.4s,v25.4s + add v29.4s,v29.4s,v1.4s -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -1869,103 +1925,104 @@ rev x17,x17 rev x20,x20 #endif - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 - eor v0.16b,v0.16b,v24.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v25.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v26.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v27.16b - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#7 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - eor v4.16b,v4.16b,v24.16b - eor v5.16b,v5.16b,v25.16b - eor v6.16b,v6.16b,v26.16b - eor v7.16b,v7.16b,v27.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 eor v8.16b,v8.16b,v0.16b - ldp q24,q25,[sp,#0] + eor x15,x15,x16 eor v9.16b,v9.16b,v1.16b - ldp q26,q27,[sp,#32] + eor x17,x17,x19 eor v10.16b,v10.16b,v2.16b + eor x20,x20,x21 eor v11.16b,v11.16b,v3.16b + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 - eor v12.16b,v12.16b,v4.16b - eor v13.16b,v13.16b,v5.16b - eor v14.16b,v14.16b,v6.16b - eor v15.16b,v15.16b,v7.16b + eor v12.16b,v12.16b,v0.16b + eor v13.16b,v13.16b,v1.16b + eor v14.16b,v14.16b,v2.16b + eor v15.16b,v15.16b,v3.16b st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 eor v16.16b,v16.16b,v8.16b + ldp q0,q1,[sp,#0] eor v17.16b,v17.16b,v9.16b + ldp q2,q3,[sp,#32] eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - shl v0.4s,v31.4s,#1 // 4 -> 8 + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 - add 
v27.4s,v27.4s,v0.4s // += 8 - add v28.4s,v28.4s,v0.4s - add v29.4s,v29.4s,v0.4s - add v30.4s,v30.4s,v0.4s + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v24.16b,v24.16b,v16.16b + eor v25.16b,v25.16b,v17.16b + eor v26.16b,v26.16b,v18.16b + eor v27.16b,v27.16b,v19.16b + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + + shl v8.4s,v7.4s,#1 // 4 -> 8 + eor v28.16b,v28.16b,v20.16b + eor v29.16b,v29.16b,v21.16b + eor v30.16b,v30.16b,v22.16b + eor v31.16b,v31.16b,v23.16b + st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 + + add v3.4s,v3.4s,v8.4s // += 8 + add v4.4s,v4.4s,v8.4s + add v5.4s,v5.4s,v8.4s + add v6.4s,v6.4s,v8.4s b.hs .Loop_outer_512_neon adds x2,x2,#512 - ushr v0.4s,v31.4s,#2 // 4 -> 1 + ushr v7.4s,v7.4s,#1 // 4 -> 2 - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] + ldp d10,d11,[sp,#128+16] // meet ABI requirements ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] - stp q24,q31,[sp,#0] // wipe off-load area - stp q24,q31,[sp,#32] - stp q24,q31,[sp,#64] + stp q0,q0,[sp,#0] // wipe off-load area + stp q0,q0,[sp,#32] + stp q0,q0,[sp,#64] b.eq .Ldone_512_neon + sub x3,x3,#16 // .Lone cmp x2,#192 - sub v27.4s,v27.4s,v0.4s // -= 1 - sub v28.4s,v28.4s,v0.4s - sub v29.4s,v29.4s,v0.4s add sp,sp,#128 + sub v3.4s,v3.4s,v7.4s // -= 2 + ld1 {v8.4s,v9.4s},[x3] b.hs .Loop_outer_neon - eor v25.16b,v25.16b,v25.16b - eor v26.16b,v26.16b,v26.16b - eor v27.16b,v27.16b,v27.16b - eor v28.16b,v28.16b,v28.16b - eor v29.16b,v29.16b,v29.16b - eor v30.16b,v30.16b,v30.16b + ldp d8,d9,[sp,#0] // meet ABI requirements + eor v1.16b,v1.16b,v1.16b + eor v2.16b,v2.16b,v2.16b + eor v3.16b,v3.16b,v3.16b + eor v4.16b,v4.16b,v4.16b + eor v5.16b,v5.16b,v5.16b + eor v6.16b,v6.16b,v6.16b b .Loop_outer .Ldone_512_neon: + ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] diff --git a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S --- a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S +++ b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S @@ -3823,7 +3823,7 @@ //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], -// int rep); +// uint64_t rep); .globl ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,%function .align 4 @@ -4023,7 +4023,7 @@ ldp x4,x5,[x1] // X ldp x6,x7,[x1,#16] - str w4,[x0,#64*0-4] + stur w4,[x0,#64*0-4] lsr x4,x4,#32 str w5,[x0,#64*1-4] lsr x5,x5,#32 @@ -4039,7 +4039,7 @@ ldp x4,x5,[x1,#32] // Y ldp x6,x7,[x1,#48] - str w4,[x0,#64*0-4] + stur w4,[x0,#64*0-4] lsr x4,x4,#32 str w5,[x0,#64*1-4] lsr x5,x5,#32 @@ -4055,7 +4055,7 @@ ldp x4,x5,[x1,#64] // Z ldp x6,x7,[x1,#80] - str w4,[x0,#64*0-4] + stur w4,[x0,#64*0-4] lsr x4,x4,#32 str w5,[x0,#64*1-4] lsr x5,x5,#32 diff --git a/sys/crypto/openssl/aarch64/ghashv8-armx.S b/sys/crypto/openssl/aarch64/ghashv8-armx.S --- a/sys/crypto/openssl/aarch64/ghashv8-armx.S +++ b/sys/crypto/openssl/aarch64/ghashv8-armx.S @@ -2,6 +2,7 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 +.arch armv8-a+crypto .text .globl gcm_init_v8 .type gcm_init_v8,%function diff --git a/sys/crypto/openssl/aarch64/keccak1600-armv8.S b/sys/crypto/openssl/aarch64/keccak1600-armv8.S --- a/sys/crypto/openssl/aarch64/keccak1600-armv8.S +++ b/sys/crypto/openssl/aarch64/keccak1600-armv8.S @@ -574,22 +574,22 @@ .type KeccakF1600_ce,%function .align 5 KeccakF1600_ce: - mov x9,#12 + mov x9,#24 adr x10,iotas b .Loop_ce .align 4 .Loop_ce: 
////////////////////////////////////////////////// Theta -.inst 0xce052819 //eor3 v25.16b,v0.16b,v5.16b,v10.16b -.inst 0xce062c3a //eor3 v26.16b,v1.16b,v6.16b,v11.16b -.inst 0xce07305b //eor3 v27.16b,v2.16b,v7.16b,v12.16b -.inst 0xce08347c //eor3 v28.16b,v3.16b,v8.16b,v13.16b -.inst 0xce09389d //eor3 v29.16b,v4.16b,v9.16b,v14.16b -.inst 0xce0f5339 //eor3 v25.16b,v25.16b, v15.16b,v20.16b -.inst 0xce10575a //eor3 v26.16b,v26.16b, v16.16b,v21.16b -.inst 0xce115b7b //eor3 v27.16b,v27.16b, v17.16b,v22.16b -.inst 0xce125f9c //eor3 v28.16b,v28.16b, v18.16b,v23.16b -.inst 0xce1363bd //eor3 v29.16b,v29.16b, v19.16b,v24.16b +.inst 0xce0f2a99 //eor3 v25.16b,v20.16b,v15.16b,v10.16b +.inst 0xce102eba //eor3 v26.16b,v21.16b,v16.16b,v11.16b +.inst 0xce1132db //eor3 v27.16b,v22.16b,v17.16b,v12.16b +.inst 0xce1236fc //eor3 v28.16b,v23.16b,v18.16b,v13.16b +.inst 0xce133b1d //eor3 v29.16b,v24.16b,v19.16b,v14.16b +.inst 0xce050339 //eor3 v25.16b,v25.16b, v5.16b,v0.16b +.inst 0xce06075a //eor3 v26.16b,v26.16b, v6.16b,v1.16b +.inst 0xce070b7b //eor3 v27.16b,v27.16b, v7.16b,v2.16b +.inst 0xce080f9c //eor3 v28.16b,v28.16b, v8.16b,v3.16b +.inst 0xce0913bd //eor3 v29.16b,v29.16b, v9.16b,v4.16b .inst 0xce7b8f3e //rax1 v30.16b,v25.16b,v27.16b // D[1] .inst 0xce7c8f5f //rax1 v31.16b,v26.16b,v28.16b // D[2] @@ -598,13 +598,15 @@ .inst 0xce7a8fbd //rax1 v29.16b,v29.16b,v26.16b // D[0] ////////////////////////////////////////////////// Theta+Rho+Pi -.inst 0xce9e50d9 //xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1] +.inst 0xce9efc39 //xar v25.16b, v1.16b,v30.16b,#64-1 // C[0]=A[2][0] + +.inst 0xce9e50c1 //xar v1.16b,v6.16b,v30.16b,#64-44 .inst 0xce9cb126 //xar v6.16b,v9.16b,v28.16b,#64-20 .inst 0xce9f0ec9 //xar v9.16b,v22.16b,v31.16b,#64-61 .inst 0xce9c65d6 //xar v22.16b,v14.16b,v28.16b,#64-39 .inst 0xce9dba8e //xar v14.16b,v20.16b,v29.16b,#64-18 -.inst 0xce9f0854 //xar v20.16b,v2.16b,v31.16b,#64-62 +.inst 0xce9f085a //xar v26.16b, v2.16b,v31.16b,#64-62 // C[1]=A[4][0] .inst 0xce9f5582 //xar v2.16b,v12.16b,v31.16b,#64-43 .inst 0xce9b9dac //xar v12.16b,v13.16b,v27.16b,#64-25 @@ -614,145 +616,57 @@ .inst 0xce9c948f //xar v15.16b,v4.16b,v28.16b,#64-27 - eor v0.16b,v0.16b,v29.16b - ldr x11,[x10],#8 - -.inst 0xce9bae5a //xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3] -.inst 0xce9fc632 //xar v18.16b,v17.16b,v31.16b,#64-15 -.inst 0xce9ed971 //xar v17.16b,v11.16b,v30.16b,#64-10 -.inst 0xce9fe8eb //xar v11.16b,v7.16b,v31.16b,#64-6 -.inst 0xce9df547 //xar v7.16b,v10.16b,v29.16b,#64-3 - -.inst 0xce9efc2a //xar v10.16b,v1.16b,v30.16b,#64-1 // * - -.inst 0xce9ccb04 //xar v4.16b,v24.16b,v28.16b,#64-14 +.inst 0xce9ccb1c //xar v28.16b, v24.16b,v28.16b,#64-14 // D[4]=A[0][4] .inst 0xce9efab8 //xar v24.16b,v21.16b,v30.16b,#64-2 -.inst 0xce9b2515 //xar v21.16b,v8.16b,v27.16b,#64-55 -.inst 0xce9e4e08 //xar v8.16b,v16.16b,v30.16b,#64-45 +.inst 0xce9b2508 //xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1] +.inst 0xce9e4e04 //xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3] .inst 0xce9d70b0 //xar v16.16b,v5.16b,v29.16b,#64-36 -.inst 0xce9b907b //xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0] +.inst 0xce9b9065 //xar v5.16b,v3.16b,v27.16b,#64-28 + + eor v0.16b,v0.16b,v29.16b + +.inst 0xce9bae5b //xar v27.16b, v18.16b,v27.16b,#64-21 // D[3]=A[0][3] +.inst 0xce9fc623 //xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3] +.inst 0xce9ed97e //xar v30.16b, v11.16b,v30.16b,#64-10 // D[1]=A[3][2] +.inst 0xce9fe8ff //xar v31.16b, v7.16b,v31.16b,#64-6 // D[2]=A[2][1] +.inst 0xce9df55d //xar v29.16b, v10.16b,v29.16b,#64-3 // 
D[0]=A[1][2] ////////////////////////////////////////////////// Chi+Iota - dup v31.2d,x11 // borrow C[6] -.inst 0xce22641c //bcax v28.16b, v0.16b,v2.16b,v25.16b // * -.inst 0xce3a0b21 //bcax v1.16b,v25.16b, v26.16b, v2.16b // * -.inst 0xce246842 //bcax v2.16b,v2.16b,v4.16b,v26.16b -.inst 0xce201343 //bcax v3.16b,v26.16b, v0.16b,v4.16b -.inst 0xce390084 //bcax v4.16b,v4.16b,v25.16b, v0.16b +.inst 0xce362354 //bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1] +.inst 0xce375915 //bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1] +.inst 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b +.inst 0xce3a62f7 //bcax v23.16b,v23.16b,v26.16b, v24.16b +.inst 0xce286b18 //bcax v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1] -.inst 0xce271b65 //bcax v5.16b,v27.16b, v7.16b,v6.16b // * -.inst 0xce281cd9 //bcax v25.16b, v6.16b,v8.16b,v7.16b // * -.inst 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b -.inst 0xce3b2508 //bcax v8.16b,v8.16b,v27.16b, v9.16b -.inst 0xce266d29 //bcax v9.16b,v9.16b,v6.16b,v27.16b + ld1r {v26.2d},[x10],#8 - eor v0.16b,v28.16b,v31.16b // Iota - -.inst 0xce2c2d5a //bcax v26.16b, v10.16b,v12.16b,v11.16b // * -.inst 0xce2d317b //bcax v27.16b, v11.16b,v13.16b,v12.16b // * -.inst 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b -.inst 0xce2a39ad //bcax v13.16b,v13.16b,v10.16b,v14.16b -.inst 0xce2b29ce //bcax v14.16b,v14.16b,v11.16b,v10.16b - -.inst 0xce3141fc //bcax v28.16b, v15.16b,v17.16b,v16.16b // * -.inst 0xce32461d //bcax v29.16b, v16.16b,v18.16b,v17.16b // * -.inst 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b -.inst 0xce2f4e52 //bcax v18.16b,v18.16b,v15.16b,v19.16b +.inst 0xce330fd1 //bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3] +.inst 0xce2f4c72 //bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3] .inst 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b +.inst 0xce3e41ef //bcax v15.16b,v15.16b,v30.16b, v16.16b +.inst 0xce237a10 //bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3] -.inst 0xce36569e //bcax v30.16b, v20.16b,v22.16b,v21.16b // * -.inst 0xce375abf //bcax v31.16b, v21.16b,v23.16b,v22.16b // * -.inst 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b -.inst 0xce3462f7 //bcax v23.16b,v23.16b,v20.16b,v24.16b -.inst 0xce355318 //bcax v24.16b,v24.16b,v21.16b,v20.16b - ////////////////////////////////////////////////// Theta -.inst 0xce056806 //eor3 v6.16b,v0.16b,v5.16b,v26.16b -.inst 0xce196c2a //eor3 v10.16b,v1.16b,v25.16b,v27.16b -.inst 0xce07304b //eor3 v11.16b,v2.16b,v7.16b,v12.16b -.inst 0xce08346f //eor3 v15.16b,v3.16b,v8.16b,v13.16b -.inst 0xce093890 //eor3 v16.16b,v4.16b,v9.16b,v14.16b -.inst 0xce1c78c6 //eor3 v6.16b,v6.16b, v28.16b,v30.16b -.inst 0xce1d7d4a //eor3 v10.16b,v10.16b, v29.16b,v31.16b -.inst 0xce11596b //eor3 v11.16b,v11.16b, v17.16b,v22.16b -.inst 0xce125def //eor3 v15.16b,v15.16b, v18.16b,v23.16b -.inst 0xce136210 //eor3 v16.16b,v16.16b, v19.16b,v24.16b - -.inst 0xce6b8cd4 //rax1 v20.16b,v6.16b,v11.16b // D[1] -.inst 0xce6f8d55 //rax1 v21.16b,v10.16b,v15.16b // D[2] -.inst 0xce708d6b //rax1 v11.16b,v11.16b,v16.16b // D[3] -.inst 0xce668def //rax1 v15.16b,v15.16b,v6.16b // D[4] -.inst 0xce6a8e10 //rax1 v16.16b,v16.16b,v10.16b // D[0] - - ////////////////////////////////////////////////// Theta+Rho+Pi -.inst 0xce945326 //xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1] -.inst 0xce8fb139 //xar v25.16b,v9.16b,v15.16b,#64-20 -.inst 0xce950ec9 //xar v9.16b,v22.16b,v21.16b,#64-61 -.inst 0xce8f65d6 //xar v22.16b,v14.16b,v15.16b,#64-39 -.inst 0xce90bbce //xar v14.16b,v30.16b,v16.16b,#64-18 - -.inst 0xce95085e //xar 
v30.16b,v2.16b,v21.16b,#64-62 - -.inst 0xce955582 //xar v2.16b,v12.16b,v21.16b,#64-43 -.inst 0xce8b9dac //xar v12.16b,v13.16b,v11.16b,#64-25 -.inst 0xce8fe26d //xar v13.16b,v19.16b,v15.16b,#64-8 -.inst 0xce8b22f3 //xar v19.16b,v23.16b,v11.16b,#64-56 -.inst 0xce905f97 //xar v23.16b,v28.16b,v16.16b,#64-41 - -.inst 0xce8f949c //xar v28.16b,v4.16b,v15.16b,#64-27 - - eor v0.16b,v0.16b,v16.16b - ldr x11,[x10],#8 - -.inst 0xce8bae4a //xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3] -.inst 0xce95c632 //xar v18.16b,v17.16b,v21.16b,#64-15 -.inst 0xce94db71 //xar v17.16b,v27.16b,v20.16b,#64-10 -.inst 0xce95e8fb //xar v27.16b,v7.16b,v21.16b,#64-6 -.inst 0xce90f747 //xar v7.16b,v26.16b,v16.16b,#64-3 - -.inst 0xce94fc3a //xar v26.16b,v1.16b,v20.16b,#64-1 // * - -.inst 0xce8fcb04 //xar v4.16b,v24.16b,v15.16b,#64-14 -.inst 0xce94fbf8 //xar v24.16b,v31.16b,v20.16b,#64-2 -.inst 0xce8b251f //xar v31.16b,v8.16b,v11.16b,#64-55 -.inst 0xce944fa8 //xar v8.16b,v29.16b,v20.16b,#64-45 -.inst 0xce9070bd //xar v29.16b,v5.16b,v16.16b,#64-36 - -.inst 0xce8b906b //xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0] - - ////////////////////////////////////////////////// Chi+Iota - dup v21.2d,x11 // borrow C[6] -.inst 0xce22180f //bcax v15.16b, v0.16b,v2.16b,v6.16b // * -.inst 0xce2a08c1 //bcax v1.16b,v6.16b, v10.16b, v2.16b // * -.inst 0xce242842 //bcax v2.16b,v2.16b,v4.16b,v10.16b -.inst 0xce201143 //bcax v3.16b,v10.16b, v0.16b,v4.16b -.inst 0xce260084 //bcax v4.16b,v4.16b,v6.16b, v0.16b - -.inst 0xce276565 //bcax v5.16b,v11.16b, v7.16b,v25.16b // * -.inst 0xce281f26 //bcax v6.16b, v25.16b,v8.16b,v7.16b // * -.inst 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b -.inst 0xce2b2508 //bcax v8.16b,v8.16b,v11.16b, v9.16b -.inst 0xce392d29 //bcax v9.16b,v9.16b,v25.16b,v11.16b - - eor v0.16b,v15.16b,v21.16b // Iota - -.inst 0xce2c6f4a //bcax v10.16b, v26.16b,v12.16b,v27.16b // * -.inst 0xce2d336b //bcax v11.16b, v27.16b,v13.16b,v12.16b // * +.inst 0xce2c7f2a //bcax v10.16b,v25.16b, v12.16b,v31.16b +.inst 0xce2d33eb //bcax v11.16b,v31.16b, v13.16b,v12.16b .inst 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b -.inst 0xce3a39ad //bcax v13.16b,v13.16b,v26.16b,v14.16b -.inst 0xce3b69ce //bcax v14.16b,v14.16b,v27.16b,v26.16b - -.inst 0xce31778f //bcax v15.16b, v28.16b,v17.16b,v29.16b // * -.inst 0xce3247b0 //bcax v16.16b, v29.16b,v18.16b,v17.16b // * -.inst 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b -.inst 0xce3c4e52 //bcax v18.16b,v18.16b,v28.16b,v19.16b -.inst 0xce3d7273 //bcax v19.16b,v19.16b,v29.16b,v28.16b - -.inst 0xce367fd4 //bcax v20.16b, v30.16b,v22.16b,v31.16b // * -.inst 0xce375bf5 //bcax v21.16b, v31.16b,v23.16b,v22.16b // * -.inst 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b -.inst 0xce3e62f7 //bcax v23.16b,v23.16b,v30.16b,v24.16b -.inst 0xce3f7b18 //bcax v24.16b,v24.16b,v31.16b,v30.16b +.inst 0xce3939ad //bcax v13.16b,v13.16b,v25.16b, v14.16b +.inst 0xce3f65ce //bcax v14.16b,v14.16b,v31.16b, v25.16b + +.inst 0xce2913a7 //bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3] +.inst 0xce252488 //bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3] +.inst 0xce261529 //bcax v9.16b,v9.16b,v6.16b,v5.16b +.inst 0xce3d18a5 //bcax v5.16b,v5.16b,v29.16b, v6.16b +.inst 0xce2474c6 //bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3] + +.inst 0xce207363 //bcax v3.16b,v27.16b, v0.16b,v28.16b +.inst 0xce210384 //bcax v4.16b,v28.16b, v1.16b,v0.16b +.inst 0xce220400 //bcax v0.16b,v0.16b,v2.16b,v1.16b +.inst 0xce3b0821 //bcax v1.16b,v1.16b,v27.16b, v2.16b +.inst 0xce3c6c42 //bcax v2.16b,v2.16b,v28.16b, v27.16b + + eor 
v0.16b,v0.16b,v26.16b + subs x9,x9,#1 bne .Loop_ce diff --git a/sys/crypto/openssl/aarch64/poly1305-armv8.S b/sys/crypto/openssl/aarch64/poly1305-armv8.S --- a/sys/crypto/openssl/aarch64/poly1305-armv8.S +++ b/sys/crypto/openssl/aarch64/poly1305-armv8.S @@ -23,17 +23,12 @@ csel x0,xzr,x0,eq b.eq .Lno_key -#ifdef __ILP32__ - ldrsw x11,.LOPENSSL_armcap_P -#else - ldr x11,.LOPENSSL_armcap_P -#endif - adr x10,.LOPENSSL_armcap_P + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] ldp x7,x8,[x1] // load key mov x9,#0xfffffffc0fffffff movk x9,#0x0fff,lsl#48 - ldr w17,[x10,x11] #ifdef __ARMEB__ rev x7,x7 // flip bytes rev x8,x8 @@ -45,10 +40,10 @@ tst w17,#ARMV7_NEON - adr x12,poly1305_blocks - adr x7,poly1305_blocks_neon - adr x13,poly1305_emit - adr x8,poly1305_emit_neon + adr x12,.Lpoly1305_blocks + adr x7,.Lpoly1305_blocks_neon + adr x13,.Lpoly1305_emit + adr x8,.Lpoly1305_emit_neon csel x12,x12,x7,eq csel x13,x13,x8,eq @@ -67,6 +62,7 @@ .type poly1305_blocks,%function .align 5 poly1305_blocks: +.Lpoly1305_blocks: ands x2,x2,#-16 b.eq .Lno_data @@ -131,6 +127,7 @@ .type poly1305_emit,%function .align 5 poly1305_emit: +.Lpoly1305_emit: ldp x4,x5,[x0] // load hash base 2^64 ldr x6,[x0,#16] ldp x10,x11,[x2] // load nonce @@ -225,10 +222,11 @@ .type poly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: +.Lpoly1305_blocks_neon: ldr x17,[x0,#24] cmp x2,#128 b.hs .Lblocks_neon - cbz x17,poly1305_blocks + cbz x17,.Lpoly1305_blocks .Lblocks_neon: .inst 0xd503233f // paciasp @@ -371,7 +369,7 @@ csel x16,x17,x16,lo mov x4,#1 - str x4,[x0,#-24] // set is_base2_26 + stur x4,[x0,#-24] // set is_base2_26 sub x0,x0,#48 // restore original x0 b .Ldo_neon @@ -808,6 +806,7 @@ .type poly1305_emit_neon,%function .align 5 poly1305_emit_neon: +.Lpoly1305_emit_neon: ldr x17,[x0,#24] cbz x17,poly1305_emit @@ -860,12 +859,6 @@ .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/sha1-armv8.S b/sys/crypto/openssl/aarch64/sha1-armv8.S --- a/sys/crypto/openssl/aarch64/sha1-armv8.S +++ b/sys/crypto/openssl/aarch64/sha1-armv8.S @@ -1,22 +1,18 @@ /* Do not modify. This file is auto-generated from sha1-armv8.pl. 
*/ -#include "arm_arch.h" - -.text - +#ifndef __KERNEL__ +# include "arm_arch.h" .hidden OPENSSL_armcap_P +#endif + +.text + .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: -#ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -#else - ldr x16,.LOPENSSL_armcap_P -#endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry @@ -37,7 +33,7 @@ movz w28,#0x7999 sub x2,x2,#1 movk w28,#0x5a82,lsl#16 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x3,x3,#32 #else rev32 x3,x3 @@ -45,7 +41,7 @@ add w24,w24,w28 // warm it up add w24,w24,w3 lsr x4,x3,#32 - ldr x5,[x1,#-56] + ldur x5,[x1,#-56] bic w25,w23,w21 and w26,w22,w21 ror w27,w20,#27 @@ -55,7 +51,7 @@ ror w21,w21,#2 add w23,w23,w4 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x5,x5,#32 #else rev32 x5,x5 @@ -70,7 +66,7 @@ add w22,w22,w5 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) lsr x6,x5,#32 - ldr x7,[x1,#-48] + ldur x7,[x1,#-48] bic w25,w21,w24 and w26,w20,w24 ror w27,w23,#27 @@ -80,7 +76,7 @@ ror w24,w24,#2 add w21,w21,w6 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x7,x7,#32 #else rev32 x7,x7 @@ -95,7 +91,7 @@ add w20,w20,w7 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) lsr x8,x7,#32 - ldr x9,[x1,#-40] + ldur x9,[x1,#-40] bic w25,w24,w22 and w26,w23,w22 ror w27,w21,#27 @@ -105,7 +101,7 @@ ror w22,w22,#2 add w24,w24,w8 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x9,x9,#32 #else rev32 x9,x9 @@ -120,7 +116,7 @@ add w23,w23,w9 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) lsr x10,x9,#32 - ldr x11,[x1,#-32] + ldur x11,[x1,#-32] bic w25,w22,w20 and w26,w21,w20 ror w27,w24,#27 @@ -130,7 +126,7 @@ ror w20,w20,#2 add w22,w22,w10 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x11,x11,#32 #else rev32 x11,x11 @@ -145,7 +141,7 @@ add w21,w21,w11 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) lsr x12,x11,#32 - ldr x13,[x1,#-24] + ldur x13,[x1,#-24] bic w25,w20,w23 and w26,w24,w23 ror w27,w22,#27 @@ -155,7 +151,7 @@ ror w23,w23,#2 add w20,w20,w12 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x13,x13,#32 #else rev32 x13,x13 @@ -170,7 +166,7 @@ add w24,w24,w13 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) lsr x14,x13,#32 - ldr x15,[x1,#-16] + ldur x15,[x1,#-16] bic w25,w23,w21 and w26,w22,w21 ror w27,w20,#27 @@ -180,7 +176,7 @@ ror w21,w21,#2 add w23,w23,w14 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x15,x15,#32 #else rev32 x15,x15 @@ -195,7 +191,7 @@ add w22,w22,w15 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) lsr x16,x15,#32 - ldr x17,[x1,#-8] + ldur x17,[x1,#-8] bic w25,w21,w24 and w26,w20,w24 ror w27,w23,#27 @@ -205,7 +201,7 @@ ror w24,w24,#2 add w21,w21,w16 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x17,x17,#32 #else rev32 x17,x17 @@ -1211,12 +1207,6 @@ .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. 
-#endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/sha256-armv8.S b/sys/crypto/openssl/aarch64/sha256-armv8.S --- a/sys/crypto/openssl/aarch64/sha256-armv8.S +++ b/sys/crypto/openssl/aarch64/sha256-armv8.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // -// Licensed under the OpenSSL license (the "License"). You may not use +// Licensed under the Apache License 2.0 (the "License"). You may not use // this file except in compliance with the License. You can obtain a copy // in the file LICENSE in the source distribution or at // https://www.openssl.org/source/license.html @@ -28,6 +28,7 @@ // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. @@ -53,27 +54,23 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" -#endif - -.text - .hidden OPENSSL_armcap_P +#endif + +.text + .globl sha256_block_data_order .type sha256_block_data_order,%function .align 6 sha256_block_data_order: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry tst w16,#ARMV7_NEON @@ -1064,15 +1061,6 @@ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. -# endif -#endif .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/sha512-armv8.S b/sys/crypto/openssl/aarch64/sha512-armv8.S --- a/sys/crypto/openssl/aarch64/sha512-armv8.S +++ b/sys/crypto/openssl/aarch64/sha512-armv8.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // -// Licensed under the OpenSSL license (the "License"). You may not use +// Licensed under the Apache License 2.0 (the "License"). You may not use // this file except in compliance with the License. You can obtain a copy // in the file LICENSE in the source distribution or at // https://www.openssl.org/source/license.html @@ -28,6 +28,7 @@ // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. 
@@ -53,27 +54,23 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" -#endif - -.text - .hidden OPENSSL_armcap_P +#endif + +.text + .globl sha512_block_data_order .type sha512_block_data_order,%function .align 6 sha512_block_data_order: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA512 b.ne .Lv8_entry #endif @@ -1086,15 +1083,6 @@ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .size .LK512,.-.LK512 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. -# endif -#endif .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/vpaes-armv8.S b/sys/crypto/openssl/aarch64/vpaes-armv8.S --- a/sys/crypto/openssl/aarch64/vpaes-armv8.S +++ b/sys/crypto/openssl/aarch64/vpaes-armv8.S @@ -91,12 +91,12 @@ .align 2 .size _vpaes_consts,.-_vpaes_consts .align 6 -## -## _aes_preheat -## -## Fills register %r10 -> .aes_consts (so you can -fPIC) -## and %xmm9-%xmm15 as specified below. -## +// +// _aes_preheat +// +// Fills register %r10 -> .aes_consts (so you can -fPIC) +// and %xmm9-%xmm15 as specified below. +// .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: @@ -108,21 +108,21 @@ ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. -## -## Inputs: -## %xmm0 = input -## %xmm9-%xmm15 as in _vpaes_preheat -## (%rdx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -## Preserves %xmm6 - %xmm8 so you get some local vectors -## -## +// +// _aes_encrypt_core +// +// AES-encrypt %xmm0. +// +// Inputs: +// %xmm0 = input +// %xmm9-%xmm15 as in _vpaes_preheat +// (%rdx) = scheduled keys +// +// Output in %xmm0 +// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +// Preserves %xmm6 - %xmm8 so you get some local vectors +// +// .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: @@ -328,11 +328,11 @@ ret .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat -## -## Decryption core -## -## Same API as encryption core. -## +// +// Decryption core +// +// Same API as encryption core. +// .type _vpaes_decrypt_core,%function .align 4 _vpaes_decrypt_core: @@ -577,11 +577,11 @@ tbl v1.16b, {v8.16b},v2.16b ret .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x -######################################################## -## ## -## AES key schedule ## -## ## -######################################################## +//////////////////////////////////////////////////////// +// // +// AES key schedule // +// // +//////////////////////////////////////////////////////// .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: @@ -637,14 +637,14 @@ b.eq .Lschedule_192 // 128: fall though -## -## .schedule_128 -## -## 128-bit specific part of key schedule. 
-## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## +// +// .schedule_128 +// +// 128-bit specific part of key schedule. +// +// This schedule is really simple, because all its parts +// are accomplished by the subroutines. +// .Lschedule_128: mov x0, #10 // mov $10, %esi @@ -655,21 +655,21 @@ bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. -## +// +// .aes_schedule_192 +// +// 192-bit specific part of key schedule. +// +// The main body of this schedule is the same as the 128-bit +// schedule, but with more smearing. The long, high side is +// stored in %xmm7 as before, and the short, low side is in +// the high bits of %xmm6. +// +// This schedule is somewhat nastier, however, because each +// round produces 192 bits of key material, or 1.5 round keys. +// Therefore, on each cycle we do 2 rounds and produce 3 round +// keys. +// .align 4 .Lschedule_192: sub x0, x0, #8 @@ -693,16 +693,16 @@ bl _vpaes_schedule_192_smear b .Loop_schedule_192 -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. -## +// +// .aes_schedule_256 +// +// 256-bit specific part of key schedule. +// +// The structure here is very similar to the 128-bit +// schedule, but with an additional "low side" in +// %xmm6. The low side's rounds are the same as the +// high side's, except no rcon and no rotation. +// .align 4 .Lschedule_256: ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) @@ -729,16 +729,16 @@ b .Loop_schedule_256 -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... jumps to cleanup and exits -## +// +// .aes_schedule_mangle_last +// +// Mangler for last round of key schedule +// Mangles %xmm0 +// when encrypting, outputs out(%xmm0) ^ 63 +// when decrypting, outputs unskew(%xmm0) +// +// Always called right before return... jumps to cleanup and exits +// .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 @@ -772,20 +772,20 @@ ret .size _vpaes_schedule_core,.-_vpaes_schedule_core -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. -## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## +// +// .aes_schedule_192_smear +// +// Smear the short, low side in the 192-bit key schedule. 
+// +// Inputs: +// %xmm7: high side, b a x y +// %xmm6: low side, d c 0 0 +// %xmm13: 0 +// +// Outputs: +// %xmm6: b+c+d b+c 0 0 +// %xmm0: b+c+d b+c b a +// .type _vpaes_schedule_192_smear,%function .align 4 _vpaes_schedule_192_smear: @@ -801,24 +801,24 @@ ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm4, %r11. -## +// +// .aes_schedule_round +// +// Runs one main round of the key schedule on %xmm0, %xmm7 +// +// Specifically, runs subbytes on the high dword of %xmm0 +// then rotates it by one byte and xors into the low dword of +// %xmm7. +// +// Adds rcon from low byte of %xmm8, then rotates %xmm8 for +// next rcon. +// +// Smears the dwords of %xmm7 by xoring the low into the +// second low, result into third, result into highest. +// +// Returns results in %xmm7 = %xmm0. +// Clobbers %xmm1-%xmm4, %r11. +// .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: @@ -866,15 +866,15 @@ ret .size _vpaes_schedule_round,.-_vpaes_schedule_round -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%r11) -## -## Requires that %xmm9 = 0x0F0F... as in preheat -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## +// +// .aes_schedule_transform +// +// Linear-transform %xmm0 according to tables at (%r11) +// +// Requires that %xmm9 = 0x0F0F... as in preheat +// Output in %xmm0 +// Clobbers %xmm1, %xmm2 +// .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: @@ -888,29 +888,29 @@ ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. -## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%rdx), and increments or decrements it -## Keeps track of round number mod 4 in %r8 -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## +// +// .aes_schedule_mangle +// +// Mangle xmm0 from (basis-transformed) standard version +// to our version. +// +// On encrypt, +// xor with 0x63 +// multiply by circulant 0,1,1,1 +// apply shiftrows transform +// +// On decrypt, +// xor with 0x63 +// multiply by "inverse mixcolumns" circulant E,B,D,9 +// deskew +// apply shiftrows transform +// +// +// Writes out to (%rdx), and increments or decrements it +// Keeps track of round number mod 4 in %r8 +// Preserves xmm0 +// Clobbers xmm1-xmm5 +// .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: diff --git a/sys/crypto/openssl/amd64/aes-x86_64.S b/sys/crypto/openssl/amd64/aes-x86_64.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/amd64/aes-x86_64.S @@ -0,0 +1,2680 @@ +/* Do not modify. This file is auto-generated from aes-x86_64.pl. 
*/ +.text +.type _x86_64_AES_encrypt,@function +.align 16 +_x86_64_AES_encrypt: +.cfi_startproc + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + + movl 240(%r15),%r13d + subl $1,%r13d + jmp .Lenc_loop +.align 16 +.Lenc_loop: + + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movl 0(%r14,%rsi,8),%r10d + movl 0(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r12d + + movzbl %bh,%esi + movzbl %ch,%edi + movzbl %dl,%ebp + xorl 3(%r14,%rsi,8),%r10d + xorl 3(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r8d + + movzbl %dh,%esi + shrl $16,%ecx + movzbl %ah,%ebp + xorl 3(%r14,%rsi,8),%r12d + shrl $16,%edx + xorl 3(%r14,%rbp,8),%r8d + + shrl $16,%ebx + leaq 16(%r15),%r15 + shrl $16,%eax + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + xorl 2(%r14,%rsi,8),%r10d + xorl 2(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r12d + + movzbl %dh,%esi + movzbl %ah,%edi + movzbl %bl,%ebp + xorl 1(%r14,%rsi,8),%r10d + xorl 1(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r8d + + movl 12(%r15),%edx + movzbl %bh,%edi + movzbl %ch,%ebp + movl 0(%r15),%eax + xorl 1(%r14,%rdi,8),%r12d + xorl 1(%r14,%rbp,8),%r8d + + movl 4(%r15),%ebx + movl 8(%r15),%ecx + xorl %r10d,%eax + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx + subl $1,%r13d + jnz .Lenc_loop + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movzbl 2(%r14,%rsi,8),%r10d + movzbl 2(%r14,%rdi,8),%r11d + movzbl 2(%r14,%rbp,8),%r12d + + movzbl %dl,%esi + movzbl %bh,%edi + movzbl %ch,%ebp + movzbl 2(%r14,%rsi,8),%r8d + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp + + andl $0x0000ff00,%edi + andl $0x0000ff00,%ebp + + xorl %edi,%r10d + xorl %ebp,%r11d + shrl $16,%ecx + + movzbl %dh,%esi + movzbl %ah,%edi + shrl $16,%edx + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi + + andl $0x0000ff00,%esi + andl $0x0000ff00,%edi + shrl $16,%ebx + xorl %esi,%r12d + xorl %edi,%r8d + shrl $16,%eax + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp + + andl $0x00ff0000,%esi + andl $0x00ff0000,%edi + andl $0x00ff0000,%ebp + + xorl %esi,%r10d + xorl %edi,%r11d + xorl %ebp,%r12d + + movzbl %bl,%esi + movzbl %dh,%edi + movzbl %ah,%ebp + movl 0(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi + movl 2(%r14,%rbp,8),%ebp + + andl $0x00ff0000,%esi + andl $0xff000000,%edi + andl $0xff000000,%ebp + + xorl %esi,%r8d + xorl %edi,%r10d + xorl %ebp,%r11d + + movzbl %bh,%esi + movzbl %ch,%edi + movl 16+12(%r15),%edx + movl 2(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi + movl 16+0(%r15),%eax + + andl $0xff000000,%esi + andl $0xff000000,%edi + + xorl %esi,%r12d + xorl %edi,%r8d + + movl 16+4(%r15),%ebx + movl 16+8(%r15),%ecx + xorl %r10d,%eax + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx +.byte 0xf3,0xc3 +.cfi_endproc +.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt +.type _x86_64_AES_encrypt_compact,@function +.align 16 +_x86_64_AES_encrypt_compact: +.cfi_startproc + leaq 128(%r14),%r8 + movl 0-128(%r8),%edi + movl 32-128(%r8),%ebp + movl 64-128(%r8),%r10d + movl 96-128(%r8),%r11d + movl 128-128(%r8),%edi + movl 160-128(%r8),%ebp + movl 192-128(%r8),%r10d + movl 224-128(%r8),%r11d + jmp .Lenc_loop_compact +.align 16 +.Lenc_loop_compact: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + leaq 16(%r15),%r15 + movzbl %al,%r10d + movzbl %bl,%r11d + movzbl %cl,%r12d + movzbl %dl,%r8d + movzbl %bh,%esi + movzbl %ch,%edi + shrl $16,%ecx + movzbl %dh,%ebp + movzbl (%r14,%r10,1),%r10d + movzbl (%r14,%r11,1),%r11d + movzbl 
(%r14,%r12,1),%r12d + movzbl (%r14,%r8,1),%r8d + + movzbl (%r14,%rsi,1),%r9d + movzbl %ah,%esi + movzbl (%r14,%rdi,1),%r13d + movzbl %cl,%edi + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + + shll $8,%r9d + shrl $16,%edx + shll $8,%r13d + xorl %r9d,%r10d + shrl $16,%eax + movzbl %dl,%r9d + shrl $16,%ebx + xorl %r13d,%r11d + shll $8,%ebp + movzbl %al,%r13d + movzbl (%r14,%rdi,1),%edi + xorl %ebp,%r12d + + shll $8,%esi + movzbl %bl,%ebp + shll $16,%edi + xorl %esi,%r8d + movzbl (%r14,%r9,1),%r9d + movzbl %dh,%esi + movzbl (%r14,%r13,1),%r13d + xorl %edi,%r10d + + shrl $8,%ecx + movzbl %ah,%edi + shll $16,%r9d + shrl $8,%ebx + shll $16,%r13d + xorl %r9d,%r11d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rcx,1),%edx + movzbl (%r14,%rbx,1),%ecx + + shll $16,%ebp + xorl %r13d,%r12d + shll $24,%esi + xorl %ebp,%r8d + shll $24,%edi + xorl %esi,%r10d + shll $24,%edx + xorl %edi,%r11d + shll $24,%ecx + movl %r10d,%eax + movl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx + cmpq 16(%rsp),%r15 + je .Lenc_compact_done + movl $0x80808080,%r10d + movl $0x80808080,%r11d + andl %eax,%r10d + andl %ebx,%r11d + movl %r10d,%esi + movl %r11d,%edi + shrl $7,%r10d + leal (%rax,%rax,1),%r8d + shrl $7,%r11d + leal (%rbx,%rbx,1),%r9d + subl %r10d,%esi + subl %r11d,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi + movl %eax,%r10d + movl %ebx,%r11d + xorl %esi,%r8d + xorl %edi,%r9d + + xorl %r8d,%eax + xorl %r9d,%ebx + movl $0x80808080,%r12d + roll $24,%eax + movl $0x80808080,%ebp + roll $24,%ebx + andl %ecx,%r12d + andl %edx,%ebp + xorl %r8d,%eax + xorl %r9d,%ebx + movl %r12d,%esi + rorl $16,%r10d + movl %ebp,%edi + rorl $16,%r11d + leal (%rcx,%rcx,1),%r8d + shrl $7,%r12d + xorl %r10d,%eax + shrl $7,%ebp + xorl %r11d,%ebx + rorl $8,%r10d + leal (%rdx,%rdx,1),%r9d + rorl $8,%r11d + subl %r12d,%esi + subl %ebp,%edi + xorl %r10d,%eax + xorl %r11d,%ebx + + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi + movl %ecx,%r12d + movl %edx,%ebp + xorl %esi,%r8d + xorl %edi,%r9d + + rorl $16,%r12d + xorl %r8d,%ecx + rorl $16,%ebp + xorl %r9d,%edx + roll $24,%ecx + movl 0(%r14),%esi + roll $24,%edx + xorl %r8d,%ecx + movl 64(%r14),%edi + xorl %r9d,%edx + movl 128(%r14),%r8d + xorl %r12d,%ecx + rorl $8,%r12d + xorl %ebp,%edx + rorl $8,%ebp + xorl %r12d,%ecx + movl 192(%r14),%r9d + xorl %ebp,%edx + jmp .Lenc_loop_compact +.align 16 +.Lenc_compact_done: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx +.byte 0xf3,0xc3 +.cfi_endproc +.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact +.globl AES_encrypt +.type AES_encrypt,@function +.align 16 +.globl asm_AES_encrypt +.hidden asm_AES_encrypt +asm_AES_encrypt: +AES_encrypt: +.cfi_startproc +.byte 243,15,30,250 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + + leaq -63(%rdx),%rcx + andq $-64,%rsp + subq %rsp,%rcx + negq %rcx + andq $0x3c0,%rcx + subq %rcx,%rsp + subq $32,%rsp + + movq %rsi,16(%rsp) + movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 +.Lenc_prologue: + + movq %rdx,%r15 + movl 240(%r15),%r13d + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + + shll $4,%r13d + leaq (%r15,%r13,1),%rbp + movq %r15,(%rsp) + movq 
%rbp,8(%rsp) + + + leaq .LAES_Te+2048(%rip),%r14 + leaq 768(%rsp),%rbp + subq %r14,%rbp + andq $0x300,%rbp + leaq (%r14,%rbp,1),%r14 + + call _x86_64_AES_encrypt_compact + + movq 16(%rsp),%r9 + movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lenc_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size AES_encrypt,.-AES_encrypt +.type _x86_64_AES_decrypt,@function +.align 16 +_x86_64_AES_decrypt: +.cfi_startproc + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + + movl 240(%r15),%r13d + subl $1,%r13d + jmp .Ldec_loop +.align 16 +.Ldec_loop: + + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movl 0(%r14,%rsi,8),%r10d + movl 0(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r12d + + movzbl %dh,%esi + movzbl %ah,%edi + movzbl %dl,%ebp + xorl 3(%r14,%rsi,8),%r10d + xorl 3(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r8d + + movzbl %bh,%esi + shrl $16,%eax + movzbl %ch,%ebp + xorl 3(%r14,%rsi,8),%r12d + shrl $16,%edx + xorl 3(%r14,%rbp,8),%r8d + + shrl $16,%ebx + leaq 16(%r15),%r15 + shrl $16,%ecx + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + xorl 2(%r14,%rsi,8),%r10d + xorl 2(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r12d + + movzbl %bh,%esi + movzbl %ch,%edi + movzbl %bl,%ebp + xorl 1(%r14,%rsi,8),%r10d + xorl 1(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r8d + + movzbl %dh,%esi + movl 12(%r15),%edx + movzbl %ah,%ebp + xorl 1(%r14,%rsi,8),%r12d + movl 0(%r15),%eax + xorl 1(%r14,%rbp,8),%r8d + + xorl %r10d,%eax + movl 4(%r15),%ebx + movl 8(%r15),%ecx + xorl %r12d,%ecx + xorl %r11d,%ebx + xorl %r8d,%edx + subl $1,%r13d + jnz .Ldec_loop + leaq 2048(%r14),%r14 + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movzbl (%r14,%rsi,1),%r10d + movzbl (%r14,%rdi,1),%r11d + movzbl (%r14,%rbp,1),%r12d + + movzbl %dl,%esi + movzbl %dh,%edi + movzbl %ah,%ebp + movzbl (%r14,%rsi,1),%r8d + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp + + shll $8,%edi + shll $8,%ebp + + xorl %edi,%r10d + xorl %ebp,%r11d + shrl $16,%edx + + movzbl %bh,%esi + movzbl %ch,%edi + shrl $16,%eax + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + + shll $8,%esi + shll $8,%edi + shrl $16,%ebx + xorl %esi,%r12d + xorl %edi,%r8d + shrl $16,%ecx + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp + + shll $16,%esi + shll $16,%edi + shll $16,%ebp + + xorl %esi,%r10d + xorl %edi,%r11d + xorl %ebp,%r12d + + movzbl %bl,%esi + movzbl %bh,%edi + movzbl %ch,%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp + + shll $16,%esi + shll $24,%edi + shll $24,%ebp + + xorl %esi,%r8d + xorl %edi,%r10d + xorl %ebp,%r11d + + movzbl %dh,%esi + movzbl %ah,%edi + movl 16+12(%r15),%edx + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movl 16+0(%r15),%eax + + shll $24,%esi + shll $24,%edi + + xorl %esi,%r12d + xorl %edi,%r8d + + movl 16+4(%r15),%ebx + movl 16+8(%r15),%ecx + leaq -2048(%r14),%r14 + xorl %r10d,%eax + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx +.byte 0xf3,0xc3 +.cfi_endproc +.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt +.type _x86_64_AES_decrypt_compact,@function +.align 16 
+_x86_64_AES_decrypt_compact: +.cfi_startproc + leaq 128(%r14),%r8 + movl 0-128(%r8),%edi + movl 32-128(%r8),%ebp + movl 64-128(%r8),%r10d + movl 96-128(%r8),%r11d + movl 128-128(%r8),%edi + movl 160-128(%r8),%ebp + movl 192-128(%r8),%r10d + movl 224-128(%r8),%r11d + jmp .Ldec_loop_compact + +.align 16 +.Ldec_loop_compact: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + leaq 16(%r15),%r15 + movzbl %al,%r10d + movzbl %bl,%r11d + movzbl %cl,%r12d + movzbl %dl,%r8d + movzbl %dh,%esi + movzbl %ah,%edi + shrl $16,%edx + movzbl %bh,%ebp + movzbl (%r14,%r10,1),%r10d + movzbl (%r14,%r11,1),%r11d + movzbl (%r14,%r12,1),%r12d + movzbl (%r14,%r8,1),%r8d + + movzbl (%r14,%rsi,1),%r9d + movzbl %ch,%esi + movzbl (%r14,%rdi,1),%r13d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + + shrl $16,%ecx + shll $8,%r13d + shll $8,%r9d + movzbl %cl,%edi + shrl $16,%eax + xorl %r9d,%r10d + shrl $16,%ebx + movzbl %dl,%r9d + + shll $8,%ebp + xorl %r13d,%r11d + shll $8,%esi + movzbl %al,%r13d + movzbl (%r14,%rdi,1),%edi + xorl %ebp,%r12d + movzbl %bl,%ebp + + shll $16,%edi + xorl %esi,%r8d + movzbl (%r14,%r9,1),%r9d + movzbl %bh,%esi + movzbl (%r14,%rbp,1),%ebp + xorl %edi,%r10d + movzbl (%r14,%r13,1),%r13d + movzbl %ch,%edi + + shll $16,%ebp + shll $16,%r9d + shll $16,%r13d + xorl %ebp,%r8d + movzbl %dh,%ebp + xorl %r9d,%r11d + shrl $8,%eax + xorl %r13d,%r12d + + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%ebx + movzbl (%r14,%rbp,1),%ecx + movzbl (%r14,%rax,1),%edx + + movl %r10d,%eax + shll $24,%esi + shll $24,%ebx + shll $24,%ecx + xorl %esi,%eax + shll $24,%edx + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx + cmpq 16(%rsp),%r15 + je .Ldec_compact_done + + movq 256+0(%r14),%rsi + shlq $32,%rbx + shlq $32,%rdx + movq 256+8(%r14),%rdi + orq %rbx,%rax + orq %rdx,%rcx + movq 256+16(%r14),%rbp + movq %rsi,%r9 + movq %rsi,%r12 + andq %rax,%r9 + andq %rcx,%r12 + movq %r9,%rbx + movq %r12,%rdx + shrq $7,%r9 + leaq (%rax,%rax,1),%r8 + shrq $7,%r12 + leaq (%rcx,%rcx,1),%r11 + subq %r9,%rbx + subq %r12,%rdx + andq %rdi,%r8 + andq %rdi,%r11 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r8 + xorq %rdx,%r11 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r8,%r10 + andq %r11,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + leaq (%r8,%r8,1),%r9 + shrq $7,%r13 + leaq (%r11,%r11,1),%r12 + subq %r10,%rbx + subq %r13,%rdx + andq %rdi,%r9 + andq %rdi,%r12 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r9 + xorq %rdx,%r12 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r9,%r10 + andq %r12,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + xorq %rax,%r8 + shrq $7,%r13 + xorq %rcx,%r11 + subq %r10,%rbx + subq %r13,%rdx + leaq (%r9,%r9,1),%r10 + leaq (%r12,%r12,1),%r13 + xorq %rax,%r9 + xorq %rcx,%r12 + andq %rdi,%r10 + andq %rdi,%r13 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r10 + xorq %rdx,%r13 + + xorq %r10,%rax + xorq %r13,%rcx + xorq %r10,%r8 + xorq %r13,%r11 + movq %rax,%rbx + movq %rcx,%rdx + xorq %r10,%r9 + shrq $32,%rbx + xorq %r13,%r12 + shrq $32,%rdx + xorq %r8,%r10 + roll $8,%eax + xorq %r11,%r13 + roll $8,%ecx + xorq %r9,%r10 + roll $8,%ebx + xorq %r12,%r13 + + roll $8,%edx + xorl %r10d,%eax + shrq $32,%r10 + xorl %r13d,%ecx + shrq $32,%r13 + xorl %r10d,%ebx + xorl %r13d,%edx + + movq %r8,%r10 + roll $24,%r8d + movq %r11,%r13 + roll $24,%r11d + shrq $32,%r10 + xorl %r8d,%eax + shrq $32,%r13 + xorl %r11d,%ecx + roll $24,%r10d + movq %r9,%r8 + roll $24,%r13d + movq %r12,%r11 + shrq $32,%r8 + xorl %r10d,%ebx + shrq $32,%r11 + xorl %r13d,%edx + + movq 0(%r14),%rsi 
+ roll $16,%r9d + movq 64(%r14),%rdi + roll $16,%r12d + movq 128(%r14),%rbp + roll $16,%r8d + movq 192(%r14),%r10 + xorl %r9d,%eax + roll $16,%r11d + xorl %r12d,%ecx + movq 256(%r14),%r13 + xorl %r8d,%ebx + xorl %r11d,%edx + jmp .Ldec_loop_compact +.align 16 +.Ldec_compact_done: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx +.byte 0xf3,0xc3 +.cfi_endproc +.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact +.globl AES_decrypt +.type AES_decrypt,@function +.align 16 +.globl asm_AES_decrypt +.hidden asm_AES_decrypt +asm_AES_decrypt: +AES_decrypt: +.cfi_startproc +.byte 243,15,30,250 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + + leaq -63(%rdx),%rcx + andq $-64,%rsp + subq %rsp,%rcx + negq %rcx + andq $0x3c0,%rcx + subq %rcx,%rsp + subq $32,%rsp + + movq %rsi,16(%rsp) + movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 +.Ldec_prologue: + + movq %rdx,%r15 + movl 240(%r15),%r13d + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + + shll $4,%r13d + leaq (%r15,%r13,1),%rbp + movq %r15,(%rsp) + movq %rbp,8(%rsp) + + + leaq .LAES_Td+2048(%rip),%r14 + leaq 768(%rsp),%rbp + subq %r14,%rbp + andq $0x300,%rbp + leaq (%r14,%rbp,1),%r14 + shrq $3,%rbp + addq %rbp,%r14 + + call _x86_64_AES_decrypt_compact + + movq 16(%rsp),%r9 + movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ldec_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size AES_decrypt,.-AES_decrypt +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,@function +.align 16 +AES_set_encrypt_key: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 +.Lenc_key_prologue: + + call _x86_64_AES_set_encrypt_key + + movq 40(%rsp),%rbp +.cfi_restore %rbp + movq 48(%rsp),%rbx +.cfi_restore %rbx + addq $56,%rsp +.cfi_adjust_cfa_offset -56 +.Lenc_key_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size AES_set_encrypt_key,.-AES_set_encrypt_key + +.type _x86_64_AES_set_encrypt_key,@function +.align 16 +_x86_64_AES_set_encrypt_key: +.cfi_startproc + movl %esi,%ecx + movq %rdi,%rsi + movq %rdx,%rdi + + testq $-1,%rsi + jz .Lbadpointer + testq $-1,%rdi + jz .Lbadpointer + + leaq .LAES_Te(%rip),%rbp + leaq 2048+128(%rbp),%rbp + + + movl 0-128(%rbp),%eax + movl 32-128(%rbp),%ebx + movl 64-128(%rbp),%r8d + movl 96-128(%rbp),%edx + movl 128-128(%rbp),%eax + movl 160-128(%rbp),%ebx + movl 192-128(%rbp),%r8d + movl 224-128(%rbp),%edx + + cmpl $128,%ecx + je .L10rounds + cmpl $192,%ecx + je .L12rounds + cmpl $256,%ecx + je .L14rounds + movq $-2,%rax + jmp .Lexit + +.L10rounds: + movq 0(%rsi),%rax + movq 8(%rsi),%rdx + movq 
%rax,0(%rdi) + movq %rdx,8(%rdi) + + shrq $32,%rdx + xorl %ecx,%ecx + jmp .L10shortcut +.align 4 +.L10loop: + movl 0(%rdi),%eax + movl 12(%rdi),%edx +.L10shortcut: + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + + xorl 1024-128(%rbp,%rcx,4),%eax + movl %eax,16(%rdi) + xorl 4(%rdi),%eax + movl %eax,20(%rdi) + xorl 8(%rdi),%eax + movl %eax,24(%rdi) + xorl 12(%rdi),%eax + movl %eax,28(%rdi) + addl $1,%ecx + leaq 16(%rdi),%rdi + cmpl $10,%ecx + jl .L10loop + + movl $10,80(%rdi) + xorq %rax,%rax + jmp .Lexit + +.L12rounds: + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 16(%rsi),%rdx + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rdx,16(%rdi) + + shrq $32,%rdx + xorl %ecx,%ecx + jmp .L12shortcut +.align 4 +.L12loop: + movl 0(%rdi),%eax + movl 20(%rdi),%edx +.L12shortcut: + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + + xorl 1024-128(%rbp,%rcx,4),%eax + movl %eax,24(%rdi) + xorl 4(%rdi),%eax + movl %eax,28(%rdi) + xorl 8(%rdi),%eax + movl %eax,32(%rdi) + xorl 12(%rdi),%eax + movl %eax,36(%rdi) + + cmpl $7,%ecx + je .L12break + addl $1,%ecx + + xorl 16(%rdi),%eax + movl %eax,40(%rdi) + xorl 20(%rdi),%eax + movl %eax,44(%rdi) + + leaq 24(%rdi),%rdi + jmp .L12loop +.L12break: + movl $12,72(%rdi) + xorq %rax,%rax + jmp .Lexit + +.L14rounds: + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 16(%rsi),%rcx + movq 24(%rsi),%rdx + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + + shrq $32,%rdx + xorl %ecx,%ecx + jmp .L14shortcut +.align 4 +.L14loop: + movl 0(%rdi),%eax + movl 28(%rdi),%edx +.L14shortcut: + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + + xorl 1024-128(%rbp,%rcx,4),%eax + movl %eax,32(%rdi) + xorl 4(%rdi),%eax + movl %eax,36(%rdi) + xorl 8(%rdi),%eax + movl %eax,40(%rdi) + xorl 12(%rdi),%eax + movl %eax,44(%rdi) + + cmpl $6,%ecx + je .L14break + addl $1,%ecx + + movl %eax,%edx + movl 16(%rdi),%eax + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + shll $8,%ebx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $16,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $24,%ebx + xorl %ebx,%eax + + movl %eax,48(%rdi) + xorl 20(%rdi),%eax + movl %eax,52(%rdi) + xorl 24(%rdi),%eax + movl %eax,56(%rdi) + xorl 28(%rdi),%eax + movl %eax,60(%rdi) + + leaq 32(%rdi),%rdi + jmp .L14loop +.L14break: + movl $14,48(%rdi) + xorq %rax,%rax + jmp .Lexit + +.Lbadpointer: + movq $-1,%rax +.Lexit: +.byte 0xf3,0xc3 +.cfi_endproc +.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,@function +.align 16 
+AES_set_decrypt_key: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 +.Ldec_key_prologue: + + call _x86_64_AES_set_encrypt_key + movq (%rsp),%r8 + cmpl $0,%eax + jne .Labort + + movl 240(%r8),%r14d + xorq %rdi,%rdi + leaq (%rdi,%r14,4),%rcx + movq %r8,%rsi + leaq (%r8,%rcx,4),%rdi +.align 4 +.Linvert: + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 0(%rdi),%rcx + movq 8(%rdi),%rdx + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,0(%rsi) + movq %rdx,8(%rsi) + leaq 16(%rsi),%rsi + leaq -16(%rdi),%rdi + cmpq %rsi,%rdi + jne .Linvert + + leaq .LAES_Te+2048+1024(%rip),%rax + + movq 40(%rax),%rsi + movq 48(%rax),%rdi + movq 56(%rax),%rbp + + movq %r8,%r15 + subl $1,%r14d +.align 4 +.Lpermute: + leaq 16(%r15),%r15 + movq 0(%r15),%rax + movq 8(%r15),%rcx + movq %rsi,%r9 + movq %rsi,%r12 + andq %rax,%r9 + andq %rcx,%r12 + movq %r9,%rbx + movq %r12,%rdx + shrq $7,%r9 + leaq (%rax,%rax,1),%r8 + shrq $7,%r12 + leaq (%rcx,%rcx,1),%r11 + subq %r9,%rbx + subq %r12,%rdx + andq %rdi,%r8 + andq %rdi,%r11 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r8 + xorq %rdx,%r11 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r8,%r10 + andq %r11,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + leaq (%r8,%r8,1),%r9 + shrq $7,%r13 + leaq (%r11,%r11,1),%r12 + subq %r10,%rbx + subq %r13,%rdx + andq %rdi,%r9 + andq %rdi,%r12 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r9 + xorq %rdx,%r12 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r9,%r10 + andq %r12,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + xorq %rax,%r8 + shrq $7,%r13 + xorq %rcx,%r11 + subq %r10,%rbx + subq %r13,%rdx + leaq (%r9,%r9,1),%r10 + leaq (%r12,%r12,1),%r13 + xorq %rax,%r9 + xorq %rcx,%r12 + andq %rdi,%r10 + andq %rdi,%r13 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r10 + xorq %rdx,%r13 + + xorq %r10,%rax + xorq %r13,%rcx + xorq %r10,%r8 + xorq %r13,%r11 + movq %rax,%rbx + movq %rcx,%rdx + xorq %r10,%r9 + shrq $32,%rbx + xorq %r13,%r12 + shrq $32,%rdx + xorq %r8,%r10 + roll $8,%eax + xorq %r11,%r13 + roll $8,%ecx + xorq %r9,%r10 + roll $8,%ebx + xorq %r12,%r13 + + roll $8,%edx + xorl %r10d,%eax + shrq $32,%r10 + xorl %r13d,%ecx + shrq $32,%r13 + xorl %r10d,%ebx + xorl %r13d,%edx + + movq %r8,%r10 + roll $24,%r8d + movq %r11,%r13 + roll $24,%r11d + shrq $32,%r10 + xorl %r8d,%eax + shrq $32,%r13 + xorl %r11d,%ecx + roll $24,%r10d + movq %r9,%r8 + roll $24,%r13d + movq %r12,%r11 + shrq $32,%r8 + xorl %r10d,%ebx + shrq $32,%r11 + xorl %r13d,%edx + + + roll $16,%r9d + + roll $16,%r12d + + roll $16,%r8d + + xorl %r9d,%eax + roll $16,%r11d + xorl %r12d,%ecx + + xorl %r8d,%ebx + xorl %r11d,%edx + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + subl $1,%r14d + jnz .Lpermute + + xorq %rax,%rax +.Labort: + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbp +.cfi_restore %rbp + movq 48(%rsp),%rbx +.cfi_restore %rbx + addq $56,%rsp +.cfi_adjust_cfa_offset -56 +.Ldec_key_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size AES_set_decrypt_key,.-AES_set_decrypt_key +.globl AES_cbc_encrypt +.type 
AES_cbc_encrypt,@function +.align 16 + +.globl asm_AES_cbc_encrypt +.hidden asm_AES_cbc_encrypt +asm_AES_cbc_encrypt: +AES_cbc_encrypt: +.cfi_startproc +.byte 243,15,30,250 + cmpq $0,%rdx + je .Lcbc_epilogue + pushfq + + +.cfi_adjust_cfa_offset 8 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-32 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-40 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-48 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-56 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-64 +.Lcbc_prologue: + + cld + movl %r9d,%r9d + + leaq .LAES_Te(%rip),%r14 + leaq .LAES_Td(%rip),%r10 + cmpq $0,%r9 + cmoveq %r10,%r14 + +.cfi_remember_state + movl OPENSSL_ia32cap_P(%rip),%r10d + cmpq $512,%rdx + jb .Lcbc_slow_prologue + testq $15,%rdx + jnz .Lcbc_slow_prologue + btl $28,%r10d + jc .Lcbc_slow_prologue + + + leaq -88-248(%rsp),%r15 + andq $-64,%r15 + + + movq %r14,%r10 + leaq 2304(%r14),%r11 + movq %r15,%r12 + andq $0xFFF,%r10 + andq $0xFFF,%r11 + andq $0xFFF,%r12 + + cmpq %r11,%r12 + jb .Lcbc_te_break_out + subq %r11,%r12 + subq %r12,%r15 + jmp .Lcbc_te_ok +.Lcbc_te_break_out: + subq %r10,%r12 + andq $0xFFF,%r12 + addq $320,%r12 + subq %r12,%r15 +.align 4 +.Lcbc_te_ok: + + xchgq %rsp,%r15 +.cfi_def_cfa_register %r15 + + movq %r15,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 +.Lcbc_fast_body: + movq %rdi,24(%rsp) + movq %rsi,32(%rsp) + movq %rdx,40(%rsp) + movq %rcx,48(%rsp) + movq %r8,56(%rsp) + movl $0,80+240(%rsp) + movq %r8,%rbp + movq %r9,%rbx + movq %rsi,%r9 + movq %rdi,%r8 + movq %rcx,%r15 + + movl 240(%r15),%eax + + movq %r15,%r10 + subq %r14,%r10 + andq $0xfff,%r10 + cmpq $2304,%r10 + jb .Lcbc_do_ecopy + cmpq $4096-248,%r10 + jb .Lcbc_skip_ecopy +.align 4 +.Lcbc_do_ecopy: + movq %r15,%rsi + leaq 80(%rsp),%rdi + leaq 80(%rsp),%r15 + movl $30,%ecx +.long 0x90A548F3 + movl %eax,(%rdi) +.Lcbc_skip_ecopy: + movq %r15,0(%rsp) + + movl $18,%ecx +.align 4 +.Lcbc_prefetch_te: + movq 0(%r14),%r10 + movq 32(%r14),%r11 + movq 64(%r14),%r12 + movq 96(%r14),%r13 + leaq 128(%r14),%r14 + subl $1,%ecx + jnz .Lcbc_prefetch_te + leaq -2304(%r14),%r14 + + cmpq $0,%rbx + je .LFAST_DECRYPT + + + movl 0(%rbp),%eax + movl 4(%rbp),%ebx + movl 8(%rbp),%ecx + movl 12(%rbp),%edx + +.align 4 +.Lcbc_fast_enc_loop: + xorl 0(%r8),%eax + xorl 4(%r8),%ebx + xorl 8(%r8),%ecx + xorl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + + call _x86_64_AES_encrypt + + movq 24(%rsp),%r8 + movq 40(%rsp),%r10 + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + subq $16,%r10 + testq $-16,%r10 + movq %r10,40(%rsp) + jnz .Lcbc_fast_enc_loop + movq 56(%rsp),%rbp + movl %eax,0(%rbp) + movl %ebx,4(%rbp) + movl %ecx,8(%rbp) + movl %edx,12(%rbp) + + jmp .Lcbc_fast_cleanup + + +.align 16 +.LFAST_DECRYPT: + cmpq %r8,%r9 + je .Lcbc_fast_dec_in_place + + movq %rbp,64(%rsp) +.align 4 +.Lcbc_fast_dec_loop: + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + + call _x86_64_AES_decrypt + + movq 64(%rsp),%rbp + movq 24(%rsp),%r8 + movq 40(%rsp),%r10 + xorl 0(%rbp),%eax + xorl 4(%rbp),%ebx + xorl 8(%rbp),%ecx + xorl 12(%rbp),%edx + movq %r8,%rbp + + subq $16,%r10 + movq %r10,40(%rsp) + movq %rbp,64(%rsp) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + jnz .Lcbc_fast_dec_loop + movq 56(%rsp),%r12 + movq 
0(%rbp),%r10 + movq 8(%rbp),%r11 + movq %r10,0(%r12) + movq %r11,8(%r12) + jmp .Lcbc_fast_cleanup + +.align 16 +.Lcbc_fast_dec_in_place: + movq 0(%rbp),%r10 + movq 8(%rbp),%r11 + movq %r10,0+64(%rsp) + movq %r11,8+64(%rsp) +.align 4 +.Lcbc_fast_dec_in_place_loop: + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + + call _x86_64_AES_decrypt + + movq 24(%rsp),%r8 + movq 40(%rsp),%r10 + xorl 0+64(%rsp),%eax + xorl 4+64(%rsp),%ebx + xorl 8+64(%rsp),%ecx + xorl 12+64(%rsp),%edx + + movq 0(%r8),%r11 + movq 8(%r8),%r12 + subq $16,%r10 + jz .Lcbc_fast_dec_in_place_done + + movq %r11,0+64(%rsp) + movq %r12,8+64(%rsp) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + movq %r10,40(%rsp) + jmp .Lcbc_fast_dec_in_place_loop +.Lcbc_fast_dec_in_place_done: + movq 56(%rsp),%rdi + movq %r11,0(%rdi) + movq %r12,8(%rdi) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + +.align 4 +.Lcbc_fast_cleanup: + cmpl $0,80+240(%rsp) + leaq 80(%rsp),%rdi + je .Lcbc_exit + movl $30,%ecx + xorq %rax,%rax +.long 0x90AB48F3 + + jmp .Lcbc_exit + + +.align 16 +.Lcbc_slow_prologue: +.cfi_restore_state + + leaq -88(%rsp),%rbp + andq $-64,%rbp + + leaq -88-63(%rcx),%r10 + subq %rbp,%r10 + negq %r10 + andq $0x3c0,%r10 + subq %r10,%rbp + + xchgq %rsp,%rbp +.cfi_def_cfa_register %rbp + + movq %rbp,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 +.Lcbc_slow_body: + + + + + movq %r8,56(%rsp) + movq %r8,%rbp + movq %r9,%rbx + movq %rsi,%r9 + movq %rdi,%r8 + movq %rcx,%r15 + movq %rdx,%r10 + + movl 240(%r15),%eax + movq %r15,0(%rsp) + shll $4,%eax + leaq (%r15,%rax,1),%rax + movq %rax,8(%rsp) + + + leaq 2048(%r14),%r14 + leaq 768-8(%rsp),%rax + subq %r14,%rax + andq $0x300,%rax + leaq (%r14,%rax,1),%r14 + + cmpq $0,%rbx + je .LSLOW_DECRYPT + + + testq $-16,%r10 + movl 0(%rbp),%eax + movl 4(%rbp),%ebx + movl 8(%rbp),%ecx + movl 12(%rbp),%edx + jz .Lcbc_slow_enc_tail + +.align 4 +.Lcbc_slow_enc_loop: + xorl 0(%r8),%eax + xorl 4(%r8),%ebx + xorl 8(%r8),%ecx + xorl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + movq %r9,32(%rsp) + movq %r10,40(%rsp) + + call _x86_64_AES_encrypt_compact + + movq 24(%rsp),%r8 + movq 32(%rsp),%r9 + movq 40(%rsp),%r10 + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + subq $16,%r10 + testq $-16,%r10 + jnz .Lcbc_slow_enc_loop + testq $15,%r10 + jnz .Lcbc_slow_enc_tail + movq 56(%rsp),%rbp + movl %eax,0(%rbp) + movl %ebx,4(%rbp) + movl %ecx,8(%rbp) + movl %edx,12(%rbp) + + jmp .Lcbc_exit + +.align 4 +.Lcbc_slow_enc_tail: + movq %rax,%r11 + movq %rcx,%r12 + movq %r10,%rcx + movq %r8,%rsi + movq %r9,%rdi +.long 0x9066A4F3 + movq $16,%rcx + subq %r10,%rcx + xorq %rax,%rax +.long 0x9066AAF3 + movq %r9,%r8 + movq $16,%r10 + movq %r11,%rax + movq %r12,%rcx + jmp .Lcbc_slow_enc_loop + +.align 16 +.LSLOW_DECRYPT: + shrq $3,%rax + addq %rax,%r14 + + movq 0(%rbp),%r11 + movq 8(%rbp),%r12 + movq %r11,0+64(%rsp) + movq %r12,8+64(%rsp) + +.align 4 +.Lcbc_slow_dec_loop: + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + movq %r9,32(%rsp) + movq %r10,40(%rsp) + + call _x86_64_AES_decrypt_compact + + movq 24(%rsp),%r8 + movq 32(%rsp),%r9 + movq 40(%rsp),%r10 + xorl 0+64(%rsp),%eax + xorl 4+64(%rsp),%ebx + xorl 8+64(%rsp),%ecx + xorl 12+64(%rsp),%edx + + movq 0(%r8),%r11 + movq 8(%r8),%r12 + subq $16,%r10 + jc 
.Lcbc_slow_dec_partial + jz .Lcbc_slow_dec_done + + movq %r11,0+64(%rsp) + movq %r12,8+64(%rsp) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + jmp .Lcbc_slow_dec_loop +.Lcbc_slow_dec_done: + movq 56(%rsp),%rdi + movq %r11,0(%rdi) + movq %r12,8(%rdi) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + jmp .Lcbc_exit + +.align 4 +.Lcbc_slow_dec_partial: + movq 56(%rsp),%rdi + movq %r11,0(%rdi) + movq %r12,8(%rdi) + + movl %eax,0+64(%rsp) + movl %ebx,4+64(%rsp) + movl %ecx,8+64(%rsp) + movl %edx,12+64(%rsp) + + movq %r9,%rdi + leaq 64(%rsp),%rsi + leaq 16(%r10),%rcx +.long 0x9066A4F3 + jmp .Lcbc_exit + +.align 16 +.Lcbc_exit: + movq 16(%rsp),%rsi +.cfi_def_cfa %rsi,64 + movq (%rsi),%r15 +.cfi_restore %r15 + movq 8(%rsi),%r14 +.cfi_restore %r14 + movq 16(%rsi),%r13 +.cfi_restore %r13 + movq 24(%rsi),%r12 +.cfi_restore %r12 + movq 32(%rsi),%rbp +.cfi_restore %rbp + movq 40(%rsi),%rbx +.cfi_restore %rbx + leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,16 +.Lcbc_popfq: + popfq + + +.cfi_adjust_cfa_offset -8 +.Lcbc_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size AES_cbc_encrypt,.-AES_cbc_encrypt +.align 64 +.LAES_Te: +.long 0xa56363c6,0xa56363c6 +.long 0x847c7cf8,0x847c7cf8 +.long 0x997777ee,0x997777ee +.long 0x8d7b7bf6,0x8d7b7bf6 +.long 0x0df2f2ff,0x0df2f2ff +.long 0xbd6b6bd6,0xbd6b6bd6 +.long 0xb16f6fde,0xb16f6fde +.long 0x54c5c591,0x54c5c591 +.long 0x50303060,0x50303060 +.long 0x03010102,0x03010102 +.long 0xa96767ce,0xa96767ce +.long 0x7d2b2b56,0x7d2b2b56 +.long 0x19fefee7,0x19fefee7 +.long 0x62d7d7b5,0x62d7d7b5 +.long 0xe6abab4d,0xe6abab4d +.long 0x9a7676ec,0x9a7676ec +.long 0x45caca8f,0x45caca8f +.long 0x9d82821f,0x9d82821f +.long 0x40c9c989,0x40c9c989 +.long 0x877d7dfa,0x877d7dfa +.long 0x15fafaef,0x15fafaef +.long 0xeb5959b2,0xeb5959b2 +.long 0xc947478e,0xc947478e +.long 0x0bf0f0fb,0x0bf0f0fb +.long 0xecadad41,0xecadad41 +.long 0x67d4d4b3,0x67d4d4b3 +.long 0xfda2a25f,0xfda2a25f +.long 0xeaafaf45,0xeaafaf45 +.long 0xbf9c9c23,0xbf9c9c23 +.long 0xf7a4a453,0xf7a4a453 +.long 0x967272e4,0x967272e4 +.long 0x5bc0c09b,0x5bc0c09b +.long 0xc2b7b775,0xc2b7b775 +.long 0x1cfdfde1,0x1cfdfde1 +.long 0xae93933d,0xae93933d +.long 0x6a26264c,0x6a26264c +.long 0x5a36366c,0x5a36366c +.long 0x413f3f7e,0x413f3f7e +.long 0x02f7f7f5,0x02f7f7f5 +.long 0x4fcccc83,0x4fcccc83 +.long 0x5c343468,0x5c343468 +.long 0xf4a5a551,0xf4a5a551 +.long 0x34e5e5d1,0x34e5e5d1 +.long 0x08f1f1f9,0x08f1f1f9 +.long 0x937171e2,0x937171e2 +.long 0x73d8d8ab,0x73d8d8ab +.long 0x53313162,0x53313162 +.long 0x3f15152a,0x3f15152a +.long 0x0c040408,0x0c040408 +.long 0x52c7c795,0x52c7c795 +.long 0x65232346,0x65232346 +.long 0x5ec3c39d,0x5ec3c39d +.long 0x28181830,0x28181830 +.long 0xa1969637,0xa1969637 +.long 0x0f05050a,0x0f05050a +.long 0xb59a9a2f,0xb59a9a2f +.long 0x0907070e,0x0907070e +.long 0x36121224,0x36121224 +.long 0x9b80801b,0x9b80801b +.long 0x3de2e2df,0x3de2e2df +.long 0x26ebebcd,0x26ebebcd +.long 0x6927274e,0x6927274e +.long 0xcdb2b27f,0xcdb2b27f +.long 0x9f7575ea,0x9f7575ea +.long 0x1b090912,0x1b090912 +.long 0x9e83831d,0x9e83831d +.long 0x742c2c58,0x742c2c58 +.long 0x2e1a1a34,0x2e1a1a34 +.long 0x2d1b1b36,0x2d1b1b36 +.long 0xb26e6edc,0xb26e6edc +.long 0xee5a5ab4,0xee5a5ab4 +.long 0xfba0a05b,0xfba0a05b +.long 0xf65252a4,0xf65252a4 +.long 0x4d3b3b76,0x4d3b3b76 +.long 0x61d6d6b7,0x61d6d6b7 +.long 0xceb3b37d,0xceb3b37d +.long 0x7b292952,0x7b292952 +.long 0x3ee3e3dd,0x3ee3e3dd +.long 0x712f2f5e,0x712f2f5e +.long 0x97848413,0x97848413 +.long 
0xf55353a6,0xf55353a6 +.long 0x68d1d1b9,0x68d1d1b9 +.long 0x00000000,0x00000000 +.long 0x2cededc1,0x2cededc1 +.long 0x60202040,0x60202040 +.long 0x1ffcfce3,0x1ffcfce3 +.long 0xc8b1b179,0xc8b1b179 +.long 0xed5b5bb6,0xed5b5bb6 +.long 0xbe6a6ad4,0xbe6a6ad4 +.long 0x46cbcb8d,0x46cbcb8d +.long 0xd9bebe67,0xd9bebe67 +.long 0x4b393972,0x4b393972 +.long 0xde4a4a94,0xde4a4a94 +.long 0xd44c4c98,0xd44c4c98 +.long 0xe85858b0,0xe85858b0 +.long 0x4acfcf85,0x4acfcf85 +.long 0x6bd0d0bb,0x6bd0d0bb +.long 0x2aefefc5,0x2aefefc5 +.long 0xe5aaaa4f,0xe5aaaa4f +.long 0x16fbfbed,0x16fbfbed +.long 0xc5434386,0xc5434386 +.long 0xd74d4d9a,0xd74d4d9a +.long 0x55333366,0x55333366 +.long 0x94858511,0x94858511 +.long 0xcf45458a,0xcf45458a +.long 0x10f9f9e9,0x10f9f9e9 +.long 0x06020204,0x06020204 +.long 0x817f7ffe,0x817f7ffe +.long 0xf05050a0,0xf05050a0 +.long 0x443c3c78,0x443c3c78 +.long 0xba9f9f25,0xba9f9f25 +.long 0xe3a8a84b,0xe3a8a84b +.long 0xf35151a2,0xf35151a2 +.long 0xfea3a35d,0xfea3a35d +.long 0xc0404080,0xc0404080 +.long 0x8a8f8f05,0x8a8f8f05 +.long 0xad92923f,0xad92923f +.long 0xbc9d9d21,0xbc9d9d21 +.long 0x48383870,0x48383870 +.long 0x04f5f5f1,0x04f5f5f1 +.long 0xdfbcbc63,0xdfbcbc63 +.long 0xc1b6b677,0xc1b6b677 +.long 0x75dadaaf,0x75dadaaf +.long 0x63212142,0x63212142 +.long 0x30101020,0x30101020 +.long 0x1affffe5,0x1affffe5 +.long 0x0ef3f3fd,0x0ef3f3fd +.long 0x6dd2d2bf,0x6dd2d2bf +.long 0x4ccdcd81,0x4ccdcd81 +.long 0x140c0c18,0x140c0c18 +.long 0x35131326,0x35131326 +.long 0x2fececc3,0x2fececc3 +.long 0xe15f5fbe,0xe15f5fbe +.long 0xa2979735,0xa2979735 +.long 0xcc444488,0xcc444488 +.long 0x3917172e,0x3917172e +.long 0x57c4c493,0x57c4c493 +.long 0xf2a7a755,0xf2a7a755 +.long 0x827e7efc,0x827e7efc +.long 0x473d3d7a,0x473d3d7a +.long 0xac6464c8,0xac6464c8 +.long 0xe75d5dba,0xe75d5dba +.long 0x2b191932,0x2b191932 +.long 0x957373e6,0x957373e6 +.long 0xa06060c0,0xa06060c0 +.long 0x98818119,0x98818119 +.long 0xd14f4f9e,0xd14f4f9e +.long 0x7fdcdca3,0x7fdcdca3 +.long 0x66222244,0x66222244 +.long 0x7e2a2a54,0x7e2a2a54 +.long 0xab90903b,0xab90903b +.long 0x8388880b,0x8388880b +.long 0xca46468c,0xca46468c +.long 0x29eeeec7,0x29eeeec7 +.long 0xd3b8b86b,0xd3b8b86b +.long 0x3c141428,0x3c141428 +.long 0x79dedea7,0x79dedea7 +.long 0xe25e5ebc,0xe25e5ebc +.long 0x1d0b0b16,0x1d0b0b16 +.long 0x76dbdbad,0x76dbdbad +.long 0x3be0e0db,0x3be0e0db +.long 0x56323264,0x56323264 +.long 0x4e3a3a74,0x4e3a3a74 +.long 0x1e0a0a14,0x1e0a0a14 +.long 0xdb494992,0xdb494992 +.long 0x0a06060c,0x0a06060c +.long 0x6c242448,0x6c242448 +.long 0xe45c5cb8,0xe45c5cb8 +.long 0x5dc2c29f,0x5dc2c29f +.long 0x6ed3d3bd,0x6ed3d3bd +.long 0xefacac43,0xefacac43 +.long 0xa66262c4,0xa66262c4 +.long 0xa8919139,0xa8919139 +.long 0xa4959531,0xa4959531 +.long 0x37e4e4d3,0x37e4e4d3 +.long 0x8b7979f2,0x8b7979f2 +.long 0x32e7e7d5,0x32e7e7d5 +.long 0x43c8c88b,0x43c8c88b +.long 0x5937376e,0x5937376e +.long 0xb76d6dda,0xb76d6dda +.long 0x8c8d8d01,0x8c8d8d01 +.long 0x64d5d5b1,0x64d5d5b1 +.long 0xd24e4e9c,0xd24e4e9c +.long 0xe0a9a949,0xe0a9a949 +.long 0xb46c6cd8,0xb46c6cd8 +.long 0xfa5656ac,0xfa5656ac +.long 0x07f4f4f3,0x07f4f4f3 +.long 0x25eaeacf,0x25eaeacf +.long 0xaf6565ca,0xaf6565ca +.long 0x8e7a7af4,0x8e7a7af4 +.long 0xe9aeae47,0xe9aeae47 +.long 0x18080810,0x18080810 +.long 0xd5baba6f,0xd5baba6f +.long 0x887878f0,0x887878f0 +.long 0x6f25254a,0x6f25254a +.long 0x722e2e5c,0x722e2e5c +.long 0x241c1c38,0x241c1c38 +.long 0xf1a6a657,0xf1a6a657 +.long 0xc7b4b473,0xc7b4b473 +.long 0x51c6c697,0x51c6c697 +.long 0x23e8e8cb,0x23e8e8cb +.long 0x7cdddda1,0x7cdddda1 +.long 
0x9c7474e8,0x9c7474e8 +.long 0x211f1f3e,0x211f1f3e +.long 0xdd4b4b96,0xdd4b4b96 +.long 0xdcbdbd61,0xdcbdbd61 +.long 0x868b8b0d,0x868b8b0d +.long 0x858a8a0f,0x858a8a0f +.long 0x907070e0,0x907070e0 +.long 0x423e3e7c,0x423e3e7c +.long 0xc4b5b571,0xc4b5b571 +.long 0xaa6666cc,0xaa6666cc +.long 0xd8484890,0xd8484890 +.long 0x05030306,0x05030306 +.long 0x01f6f6f7,0x01f6f6f7 +.long 0x120e0e1c,0x120e0e1c +.long 0xa36161c2,0xa36161c2 +.long 0x5f35356a,0x5f35356a +.long 0xf95757ae,0xf95757ae +.long 0xd0b9b969,0xd0b9b969 +.long 0x91868617,0x91868617 +.long 0x58c1c199,0x58c1c199 +.long 0x271d1d3a,0x271d1d3a +.long 0xb99e9e27,0xb99e9e27 +.long 0x38e1e1d9,0x38e1e1d9 +.long 0x13f8f8eb,0x13f8f8eb +.long 0xb398982b,0xb398982b +.long 0x33111122,0x33111122 +.long 0xbb6969d2,0xbb6969d2 +.long 0x70d9d9a9,0x70d9d9a9 +.long 0x898e8e07,0x898e8e07 +.long 0xa7949433,0xa7949433 +.long 0xb69b9b2d,0xb69b9b2d +.long 0x221e1e3c,0x221e1e3c +.long 0x92878715,0x92878715 +.long 0x20e9e9c9,0x20e9e9c9 +.long 0x49cece87,0x49cece87 +.long 0xff5555aa,0xff5555aa +.long 0x78282850,0x78282850 +.long 0x7adfdfa5,0x7adfdfa5 +.long 0x8f8c8c03,0x8f8c8c03 +.long 0xf8a1a159,0xf8a1a159 +.long 0x80898909,0x80898909 +.long 0x170d0d1a,0x170d0d1a +.long 0xdabfbf65,0xdabfbf65 +.long 0x31e6e6d7,0x31e6e6d7 +.long 0xc6424284,0xc6424284 +.long 0xb86868d0,0xb86868d0 +.long 0xc3414182,0xc3414182 +.long 0xb0999929,0xb0999929 +.long 0x772d2d5a,0x772d2d5a +.long 0x110f0f1e,0x110f0f1e +.long 0xcbb0b07b,0xcbb0b07b +.long 0xfc5454a8,0xfc5454a8 +.long 0xd6bbbb6d,0xd6bbbb6d +.long 0x3a16162c,0x3a16162c +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 
0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 
0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.long 0x00000001, 0x00000002, 0x00000004, 0x00000008 +.long 0x00000010, 0x00000020, 0x00000040, 0x00000080 +.long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 +.long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b +.align 64 +.LAES_Td: +.long 0x50a7f451,0x50a7f451 +.long 0x5365417e,0x5365417e +.long 0xc3a4171a,0xc3a4171a +.long 0x965e273a,0x965e273a +.long 0xcb6bab3b,0xcb6bab3b +.long 0xf1459d1f,0xf1459d1f +.long 0xab58faac,0xab58faac +.long 0x9303e34b,0x9303e34b +.long 0x55fa3020,0x55fa3020 +.long 0xf66d76ad,0xf66d76ad +.long 0x9176cc88,0x9176cc88 +.long 0x254c02f5,0x254c02f5 +.long 0xfcd7e54f,0xfcd7e54f +.long 0xd7cb2ac5,0xd7cb2ac5 +.long 0x80443526,0x80443526 +.long 0x8fa362b5,0x8fa362b5 +.long 0x495ab1de,0x495ab1de +.long 0x671bba25,0x671bba25 +.long 0x980eea45,0x980eea45 +.long 0xe1c0fe5d,0xe1c0fe5d +.long 0x02752fc3,0x02752fc3 +.long 0x12f04c81,0x12f04c81 +.long 0xa397468d,0xa397468d +.long 0xc6f9d36b,0xc6f9d36b +.long 0xe75f8f03,0xe75f8f03 +.long 0x959c9215,0x959c9215 +.long 0xeb7a6dbf,0xeb7a6dbf +.long 0xda595295,0xda595295 +.long 0x2d83bed4,0x2d83bed4 +.long 0xd3217458,0xd3217458 +.long 0x2969e049,0x2969e049 +.long 0x44c8c98e,0x44c8c98e +.long 0x6a89c275,0x6a89c275 +.long 0x78798ef4,0x78798ef4 +.long 0x6b3e5899,0x6b3e5899 +.long 0xdd71b927,0xdd71b927 +.long 0xb64fe1be,0xb64fe1be +.long 0x17ad88f0,0x17ad88f0 +.long 0x66ac20c9,0x66ac20c9 +.long 0xb43ace7d,0xb43ace7d +.long 0x184adf63,0x184adf63 +.long 0x82311ae5,0x82311ae5 +.long 0x60335197,0x60335197 +.long 0x457f5362,0x457f5362 +.long 0xe07764b1,0xe07764b1 +.long 0x84ae6bbb,0x84ae6bbb +.long 0x1ca081fe,0x1ca081fe +.long 0x942b08f9,0x942b08f9 +.long 0x58684870,0x58684870 +.long 0x19fd458f,0x19fd458f +.long 0x876cde94,0x876cde94 +.long 0xb7f87b52,0xb7f87b52 +.long 0x23d373ab,0x23d373ab +.long 0xe2024b72,0xe2024b72 +.long 0x578f1fe3,0x578f1fe3 +.long 0x2aab5566,0x2aab5566 +.long 0x0728ebb2,0x0728ebb2 +.long 0x03c2b52f,0x03c2b52f +.long 0x9a7bc586,0x9a7bc586 +.long 0xa50837d3,0xa50837d3 +.long 0xf2872830,0xf2872830 +.long 0xb2a5bf23,0xb2a5bf23 +.long 0xba6a0302,0xba6a0302 +.long 0x5c8216ed,0x5c8216ed +.long 0x2b1ccf8a,0x2b1ccf8a +.long 0x92b479a7,0x92b479a7 +.long 0xf0f207f3,0xf0f207f3 +.long 0xa1e2694e,0xa1e2694e +.long 0xcdf4da65,0xcdf4da65 +.long 0xd5be0506,0xd5be0506 +.long 0x1f6234d1,0x1f6234d1 +.long 0x8afea6c4,0x8afea6c4 +.long 0x9d532e34,0x9d532e34 +.long 0xa055f3a2,0xa055f3a2 +.long 0x32e18a05,0x32e18a05 +.long 0x75ebf6a4,0x75ebf6a4 +.long 0x39ec830b,0x39ec830b +.long 0xaaef6040,0xaaef6040 +.long 0x069f715e,0x069f715e +.long 0x51106ebd,0x51106ebd +.long 0xf98a213e,0xf98a213e +.long 0x3d06dd96,0x3d06dd96 +.long 0xae053edd,0xae053edd +.long 0x46bde64d,0x46bde64d +.long 0xb58d5491,0xb58d5491 +.long 0x055dc471,0x055dc471 +.long 0x6fd40604,0x6fd40604 +.long 0xff155060,0xff155060 +.long 0x24fb9819,0x24fb9819 +.long 0x97e9bdd6,0x97e9bdd6 +.long 0xcc434089,0xcc434089 +.long 0x779ed967,0x779ed967 +.long 0xbd42e8b0,0xbd42e8b0 +.long 0x888b8907,0x888b8907 +.long 0x385b19e7,0x385b19e7 +.long 0xdbeec879,0xdbeec879 +.long 
0x470a7ca1,0x470a7ca1 +.long 0xe90f427c,0xe90f427c +.long 0xc91e84f8,0xc91e84f8 +.long 0x00000000,0x00000000 +.long 0x83868009,0x83868009 +.long 0x48ed2b32,0x48ed2b32 +.long 0xac70111e,0xac70111e +.long 0x4e725a6c,0x4e725a6c +.long 0xfbff0efd,0xfbff0efd +.long 0x5638850f,0x5638850f +.long 0x1ed5ae3d,0x1ed5ae3d +.long 0x27392d36,0x27392d36 +.long 0x64d90f0a,0x64d90f0a +.long 0x21a65c68,0x21a65c68 +.long 0xd1545b9b,0xd1545b9b +.long 0x3a2e3624,0x3a2e3624 +.long 0xb1670a0c,0xb1670a0c +.long 0x0fe75793,0x0fe75793 +.long 0xd296eeb4,0xd296eeb4 +.long 0x9e919b1b,0x9e919b1b +.long 0x4fc5c080,0x4fc5c080 +.long 0xa220dc61,0xa220dc61 +.long 0x694b775a,0x694b775a +.long 0x161a121c,0x161a121c +.long 0x0aba93e2,0x0aba93e2 +.long 0xe52aa0c0,0xe52aa0c0 +.long 0x43e0223c,0x43e0223c +.long 0x1d171b12,0x1d171b12 +.long 0x0b0d090e,0x0b0d090e +.long 0xadc78bf2,0xadc78bf2 +.long 0xb9a8b62d,0xb9a8b62d +.long 0xc8a91e14,0xc8a91e14 +.long 0x8519f157,0x8519f157 +.long 0x4c0775af,0x4c0775af +.long 0xbbdd99ee,0xbbdd99ee +.long 0xfd607fa3,0xfd607fa3 +.long 0x9f2601f7,0x9f2601f7 +.long 0xbcf5725c,0xbcf5725c +.long 0xc53b6644,0xc53b6644 +.long 0x347efb5b,0x347efb5b +.long 0x7629438b,0x7629438b +.long 0xdcc623cb,0xdcc623cb +.long 0x68fcedb6,0x68fcedb6 +.long 0x63f1e4b8,0x63f1e4b8 +.long 0xcadc31d7,0xcadc31d7 +.long 0x10856342,0x10856342 +.long 0x40229713,0x40229713 +.long 0x2011c684,0x2011c684 +.long 0x7d244a85,0x7d244a85 +.long 0xf83dbbd2,0xf83dbbd2 +.long 0x1132f9ae,0x1132f9ae +.long 0x6da129c7,0x6da129c7 +.long 0x4b2f9e1d,0x4b2f9e1d +.long 0xf330b2dc,0xf330b2dc +.long 0xec52860d,0xec52860d +.long 0xd0e3c177,0xd0e3c177 +.long 0x6c16b32b,0x6c16b32b +.long 0x99b970a9,0x99b970a9 +.long 0xfa489411,0xfa489411 +.long 0x2264e947,0x2264e947 +.long 0xc48cfca8,0xc48cfca8 +.long 0x1a3ff0a0,0x1a3ff0a0 +.long 0xd82c7d56,0xd82c7d56 +.long 0xef903322,0xef903322 +.long 0xc74e4987,0xc74e4987 +.long 0xc1d138d9,0xc1d138d9 +.long 0xfea2ca8c,0xfea2ca8c +.long 0x360bd498,0x360bd498 +.long 0xcf81f5a6,0xcf81f5a6 +.long 0x28de7aa5,0x28de7aa5 +.long 0x268eb7da,0x268eb7da +.long 0xa4bfad3f,0xa4bfad3f +.long 0xe49d3a2c,0xe49d3a2c +.long 0x0d927850,0x0d927850 +.long 0x9bcc5f6a,0x9bcc5f6a +.long 0x62467e54,0x62467e54 +.long 0xc2138df6,0xc2138df6 +.long 0xe8b8d890,0xe8b8d890 +.long 0x5ef7392e,0x5ef7392e +.long 0xf5afc382,0xf5afc382 +.long 0xbe805d9f,0xbe805d9f +.long 0x7c93d069,0x7c93d069 +.long 0xa92dd56f,0xa92dd56f +.long 0xb31225cf,0xb31225cf +.long 0x3b99acc8,0x3b99acc8 +.long 0xa77d1810,0xa77d1810 +.long 0x6e639ce8,0x6e639ce8 +.long 0x7bbb3bdb,0x7bbb3bdb +.long 0x097826cd,0x097826cd +.long 0xf418596e,0xf418596e +.long 0x01b79aec,0x01b79aec +.long 0xa89a4f83,0xa89a4f83 +.long 0x656e95e6,0x656e95e6 +.long 0x7ee6ffaa,0x7ee6ffaa +.long 0x08cfbc21,0x08cfbc21 +.long 0xe6e815ef,0xe6e815ef +.long 0xd99be7ba,0xd99be7ba +.long 0xce366f4a,0xce366f4a +.long 0xd4099fea,0xd4099fea +.long 0xd67cb029,0xd67cb029 +.long 0xafb2a431,0xafb2a431 +.long 0x31233f2a,0x31233f2a +.long 0x3094a5c6,0x3094a5c6 +.long 0xc066a235,0xc066a235 +.long 0x37bc4e74,0x37bc4e74 +.long 0xa6ca82fc,0xa6ca82fc +.long 0xb0d090e0,0xb0d090e0 +.long 0x15d8a733,0x15d8a733 +.long 0x4a9804f1,0x4a9804f1 +.long 0xf7daec41,0xf7daec41 +.long 0x0e50cd7f,0x0e50cd7f +.long 0x2ff69117,0x2ff69117 +.long 0x8dd64d76,0x8dd64d76 +.long 0x4db0ef43,0x4db0ef43 +.long 0x544daacc,0x544daacc +.long 0xdf0496e4,0xdf0496e4 +.long 0xe3b5d19e,0xe3b5d19e +.long 0x1b886a4c,0x1b886a4c +.long 0xb81f2cc1,0xb81f2cc1 +.long 0x7f516546,0x7f516546 +.long 0x04ea5e9d,0x04ea5e9d +.long 0x5d358c01,0x5d358c01 +.long 
0x737487fa,0x737487fa +.long 0x2e410bfb,0x2e410bfb +.long 0x5a1d67b3,0x5a1d67b3 +.long 0x52d2db92,0x52d2db92 +.long 0x335610e9,0x335610e9 +.long 0x1347d66d,0x1347d66d +.long 0x8c61d79a,0x8c61d79a +.long 0x7a0ca137,0x7a0ca137 +.long 0x8e14f859,0x8e14f859 +.long 0x893c13eb,0x893c13eb +.long 0xee27a9ce,0xee27a9ce +.long 0x35c961b7,0x35c961b7 +.long 0xede51ce1,0xede51ce1 +.long 0x3cb1477a,0x3cb1477a +.long 0x59dfd29c,0x59dfd29c +.long 0x3f73f255,0x3f73f255 +.long 0x79ce1418,0x79ce1418 +.long 0xbf37c773,0xbf37c773 +.long 0xeacdf753,0xeacdf753 +.long 0x5baafd5f,0x5baafd5f +.long 0x146f3ddf,0x146f3ddf +.long 0x86db4478,0x86db4478 +.long 0x81f3afca,0x81f3afca +.long 0x3ec468b9,0x3ec468b9 +.long 0x2c342438,0x2c342438 +.long 0x5f40a3c2,0x5f40a3c2 +.long 0x72c31d16,0x72c31d16 +.long 0x0c25e2bc,0x0c25e2bc +.long 0x8b493c28,0x8b493c28 +.long 0x41950dff,0x41950dff +.long 0x7101a839,0x7101a839 +.long 0xdeb30c08,0xdeb30c08 +.long 0x9ce4b4d8,0x9ce4b4d8 +.long 0x90c15664,0x90c15664 +.long 0x6184cb7b,0x6184cb7b +.long 0x70b632d5,0x70b632d5 +.long 0x745c6c48,0x745c6c48 +.long 0x4257b8d0,0x4257b8d0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 
0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 
0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/aesni-gcm-x86_64.S b/sys/crypto/openssl/amd64/aesni-gcm-x86_64.S --- a/sys/crypto/openssl/amd64/aesni-gcm-x86_64.S +++ b/sys/crypto/openssl/amd64/aesni-gcm-x86_64.S @@ -788,3 +788,24 @@ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/aesni-mb-x86_64.S b/sys/crypto/openssl/amd64/aesni-mb-x86_64.S --- a/sys/crypto/openssl/amd64/aesni-mb-x86_64.S +++ b/sys/crypto/openssl/amd64/aesni-mb-x86_64.S @@ -49,39 +49,47 @@ .Lenc4x_loop_grande: movl %edx,24(%rsp) xorl %edx,%edx + movl -64(%rdi),%ecx movq -80(%rdi),%r8 cmpl %edx,%ecx movq -72(%rdi),%r12 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu -56(%rdi),%xmm2 movl %ecx,32(%rsp) cmovleq %rsp,%r8 + movl -24(%rdi),%ecx movq -40(%rdi),%r9 cmpl %edx,%ecx movq -32(%rdi),%r13 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu -16(%rdi),%xmm3 movl %ecx,36(%rsp) cmovleq %rsp,%r9 + movl 16(%rdi),%ecx movq 0(%rdi),%r10 cmpl %edx,%ecx movq 8(%rdi),%r14 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu 24(%rdi),%xmm4 movl %ecx,40(%rsp) cmovleq %rsp,%r10 + movl 56(%rdi),%ecx movq 40(%rdi),%r11 cmpl %edx,%ecx movq 48(%rdi),%r15 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu 64(%rdi),%xmm5 movl %ecx,44(%rsp) cmovleq %rsp,%r11 @@ -261,6 +269,7 @@ + leaq 160(%rdi),%rdi decl %edx jnz .Lenc4x_loop_grande @@ -331,39 +340,47 @@ .Ldec4x_loop_grande: movl %edx,24(%rsp) xorl %edx,%edx + movl -64(%rdi),%ecx movq -80(%rdi),%r8 cmpl %edx,%ecx movq -72(%rdi),%r12 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu -56(%rdi),%xmm6 movl %ecx,32(%rsp) cmovleq %rsp,%r8 + movl -24(%rdi),%ecx movq -40(%rdi),%r9 cmpl %edx,%ecx movq -32(%rdi),%r13 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu -16(%rdi),%xmm7 movl %ecx,36(%rsp) cmovleq %rsp,%r9 + movl 16(%rdi),%ecx movq 0(%rdi),%r10 cmpl %edx,%ecx movq 8(%rdi),%r14 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu 24(%rdi),%xmm8 movl %ecx,40(%rsp) cmovleq %rsp,%r10 + movl 56(%rdi),%ecx movq 40(%rdi),%r11 cmpl %edx,%ecx movq 48(%rdi),%r15 cmovgl %ecx,%edx testl %ecx,%ecx + movdqu 64(%rdi),%xmm9 movl %ecx,44(%rsp) cmovleq %rsp,%r11 @@ -599,89 +616,121 @@ .Lenc8x_loop_grande: xorl %edx,%edx + movl 
-144(%rdi),%ecx + movq -160(%rdi),%r8 cmpl %edx,%ecx + movq -152(%rdi),%rbx cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -136(%rdi),%xmm2 movl %ecx,32(%rsp) cmovleq %rsp,%r8 subq %r8,%rbx movq %rbx,64(%rsp) + movl -104(%rdi),%ecx + movq -120(%rdi),%r9 cmpl %edx,%ecx + movq -112(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -96(%rdi),%xmm3 movl %ecx,36(%rsp) cmovleq %rsp,%r9 subq %r9,%rbp movq %rbp,72(%rsp) + movl -64(%rdi),%ecx + movq -80(%rdi),%r10 cmpl %edx,%ecx + movq -72(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -56(%rdi),%xmm4 movl %ecx,40(%rsp) cmovleq %rsp,%r10 subq %r10,%rbp movq %rbp,80(%rsp) + movl -24(%rdi),%ecx + movq -40(%rdi),%r11 cmpl %edx,%ecx + movq -32(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -16(%rdi),%xmm5 movl %ecx,44(%rsp) cmovleq %rsp,%r11 subq %r11,%rbp movq %rbp,88(%rsp) + movl 16(%rdi),%ecx + movq 0(%rdi),%r12 cmpl %edx,%ecx + movq 8(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 24(%rdi),%xmm6 movl %ecx,48(%rsp) cmovleq %rsp,%r12 subq %r12,%rbp movq %rbp,96(%rsp) + movl 56(%rdi),%ecx + movq 40(%rdi),%r13 cmpl %edx,%ecx + movq 48(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 64(%rdi),%xmm7 movl %ecx,52(%rsp) cmovleq %rsp,%r13 subq %r13,%rbp movq %rbp,104(%rsp) + movl 96(%rdi),%ecx + movq 80(%rdi),%r14 cmpl %edx,%ecx + movq 88(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 104(%rdi),%xmm8 movl %ecx,56(%rsp) cmovleq %rsp,%r14 subq %r14,%rbp movq %rbp,112(%rsp) + movl 136(%rdi),%ecx + movq 120(%rdi),%r15 cmpl %edx,%ecx + movq 128(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 144(%rdi),%xmm9 movl %ecx,60(%rsp) cmovleq %rsp,%r15 @@ -1056,96 +1105,128 @@ .Ldec8x_loop_grande: xorl %edx,%edx + movl -144(%rdi),%ecx + movq -160(%rdi),%r8 cmpl %edx,%ecx + movq -152(%rdi),%rbx cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -136(%rdi),%xmm2 movl %ecx,32(%rsp) cmovleq %rsp,%r8 subq %r8,%rbx movq %rbx,64(%rsp) vmovdqu %xmm2,192(%rsp) + movl -104(%rdi),%ecx + movq -120(%rdi),%r9 cmpl %edx,%ecx + movq -112(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -96(%rdi),%xmm3 movl %ecx,36(%rsp) cmovleq %rsp,%r9 subq %r9,%rbp movq %rbp,72(%rsp) vmovdqu %xmm3,208(%rsp) + movl -64(%rdi),%ecx + movq -80(%rdi),%r10 cmpl %edx,%ecx + movq -72(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -56(%rdi),%xmm4 movl %ecx,40(%rsp) cmovleq %rsp,%r10 subq %r10,%rbp movq %rbp,80(%rsp) vmovdqu %xmm4,224(%rsp) + movl -24(%rdi),%ecx + movq -40(%rdi),%r11 cmpl %edx,%ecx + movq -32(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu -16(%rdi),%xmm5 movl %ecx,44(%rsp) cmovleq %rsp,%r11 subq %r11,%rbp movq %rbp,88(%rsp) vmovdqu %xmm5,240(%rsp) + movl 16(%rdi),%ecx + movq 0(%rdi),%r12 cmpl %edx,%ecx + movq 8(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 24(%rdi),%xmm6 movl %ecx,48(%rsp) cmovleq %rsp,%r12 subq %r12,%rbp movq %rbp,96(%rsp) vmovdqu %xmm6,256(%rsp) + movl 56(%rdi),%ecx + movq 40(%rdi),%r13 cmpl %edx,%ecx + movq 48(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 64(%rdi),%xmm7 movl %ecx,52(%rsp) cmovleq %rsp,%r13 subq %r13,%rbp movq %rbp,104(%rsp) vmovdqu %xmm7,272(%rsp) + movl 96(%rdi),%ecx + movq 80(%rdi),%r14 cmpl %edx,%ecx + movq 88(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 104(%rdi),%xmm8 movl %ecx,56(%rsp) cmovleq %rsp,%r14 subq %r14,%rbp movq %rbp,112(%rsp) vmovdqu %xmm8,288(%rsp) + movl 136(%rdi),%ecx + movq 120(%rdi),%r15 cmpl %edx,%ecx + movq 128(%rdi),%rbp cmovgl %ecx,%edx testl %ecx,%ecx + vmovdqu 144(%rdi),%xmm9 movl %ecx,60(%rsp) cmovleq %rsp,%r15 @@ -1506,3 +1587,24 @@ .byte 0xf3,0xc3 
.cfi_endproc .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/aesni-sha1-x86_64.S b/sys/crypto/openssl/amd64/aesni-sha1-x86_64.S --- a/sys/crypto/openssl/amd64/aesni-sha1-x86_64.S +++ b/sys/crypto/openssl/amd64/aesni-sha1-x86_64.S @@ -3034,3 +3034,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/aesni-sha256-x86_64.S b/sys/crypto/openssl/amd64/aesni-sha256-x86_64.S --- a/sys/crypto/openssl/amd64/aesni-sha256-x86_64.S +++ b/sys/crypto/openssl/amd64/aesni-sha256-x86_64.S @@ -4434,3 +4434,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/aesni-x86_64.S b/sys/crypto/openssl/amd64/aesni-x86_64.S --- a/sys/crypto/openssl/amd64/aesni-x86_64.S +++ b/sys/crypto/openssl/amd64/aesni-x86_64.S @@ -6,6 +6,7 @@ .align 16 aesni_encrypt: .cfi_startproc +.byte 243,15,30,250 movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -32,6 +33,7 @@ .align 16 aesni_decrypt: .cfi_startproc +.byte 243,15,30,250 movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -519,6 +521,7 @@ .align 16 aesni_ecb_encrypt: .cfi_startproc +.byte 243,15,30,250 andq $-16,%rdx jz .Lecb_ret @@ -863,6 +866,7 @@ .align 16 aesni_ccm64_encrypt_blocks: .cfi_startproc +.byte 243,15,30,250 movl 240(%rcx),%eax movdqu (%r8),%xmm6 movdqa .Lincrement64(%rip),%xmm9 @@ -928,6 +932,7 @@ .align 16 aesni_ccm64_decrypt_blocks: .cfi_startproc +.byte 243,15,30,250 movl 240(%rcx),%eax movups (%r8),%xmm6 movdqu (%r9),%xmm3 @@ -1027,6 +1032,7 @@ .align 16 aesni_ctr32_encrypt_blocks: .cfi_startproc +.byte 243,15,30,250 cmpq $1,%rdx jne .Lctr32_bulk @@ -1605,6 +1611,7 @@ .align 16 aesni_xts_encrypt: .cfi_startproc +.byte 243,15,30,250 leaq (%rsp),%r11 .cfi_def_cfa_register %r11 pushq %rbp @@ -2075,6 +2082,7 @@ .align 16 aesni_xts_decrypt: .cfi_startproc +.byte 243,15,30,250 leaq (%rsp),%r11 .cfi_def_cfa_register %r11 pushq %rbp @@ -2582,6 +2590,7 @@ .align 32 aesni_ocb_encrypt: .cfi_startproc +.byte 243,15,30,250 leaq (%rsp),%rax pushq %rbx .cfi_adjust_cfa_offset 8 @@ -3009,6 +3018,7 @@ .align 32 aesni_ocb_decrypt: .cfi_startproc +.byte 243,15,30,250 leaq (%rsp),%rax pushq %rbx .cfi_adjust_cfa_offset 8 @@ -3446,6 +3456,7 @@ .align 16 aesni_cbc_encrypt: .cfi_startproc +.byte 243,15,30,250 testq %rdx,%rdx jz .Lcbc_ret @@ -4473,3 +4484,24 @@ .byte 
65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/bsaes-x86_64.S b/sys/crypto/openssl/amd64/bsaes-x86_64.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/amd64/bsaes-x86_64.S @@ -0,0 +1,2619 @@ +/* Do not modify. This file is auto-generated from bsaes-x86_64.pl. */ +.text + + + + +.type _bsaes_encrypt8,@function +.align 64 +_bsaes_encrypt8: +.cfi_startproc + leaq .LBS0(%rip),%r11 + + movdqa (%rax),%xmm8 + leaq 16(%rax),%rax + movdqa 80(%r11),%xmm7 + pxor %xmm8,%xmm15 + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 +_bsaes_encrypt8_bitslice: + movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm5,%xmm9 + psrlq $1,%xmm5 + movdqa %xmm3,%xmm10 + psrlq $1,%xmm3 + pxor %xmm6,%xmm5 + pxor %xmm4,%xmm3 + pand %xmm7,%xmm5 + pand %xmm7,%xmm3 + pxor %xmm5,%xmm6 + psllq $1,%xmm5 + pxor %xmm3,%xmm4 + psllq $1,%xmm3 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm3 + movdqa %xmm1,%xmm9 + psrlq $1,%xmm1 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm1 + pand %xmm7,%xmm15 + pxor %xmm1,%xmm2 + psllq $1,%xmm1 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm4,%xmm9 + psrlq $2,%xmm4 + movdqa %xmm3,%xmm10 + psrlq $2,%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pand %xmm8,%xmm4 + pand %xmm8,%xmm3 + pxor %xmm4,%xmm6 + psllq $2,%xmm4 + pxor %xmm3,%xmm5 + psllq $2,%xmm3 + pxor %xmm9,%xmm4 + pxor %xmm10,%xmm3 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm2,%xmm0 + pxor %xmm1,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm2 + psllq $2,%xmm0 + pxor %xmm15,%xmm1 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm2,%xmm9 + psrlq $4,%xmm2 + movdqa %xmm1,%xmm10 + psrlq $4,%xmm1 + pxor %xmm6,%xmm2 + pxor %xmm5,%xmm1 + pand %xmm7,%xmm2 + pand %xmm7,%xmm1 + pxor %xmm2,%xmm6 + psllq $4,%xmm2 + pxor %xmm1,%xmm5 + psllq $4,%xmm1 + pxor %xmm9,%xmm2 + pxor %xmm10,%xmm1 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm4 + psllq $4,%xmm0 + pxor %xmm15,%xmm3 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + decl %r10d + jmp .Lenc_sbox +.align 16 +.Lenc_loop: + pxor 0(%rax),%xmm15 + pxor 16(%rax),%xmm0 + pxor 32(%rax),%xmm1 + pxor 48(%rax),%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor 64(%rax),%xmm3 + pxor 80(%rax),%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor 96(%rax),%xmm5 + pxor 112(%rax),%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + leaq 128(%rax),%rax +.Lenc_sbox: + pxor %xmm5,%xmm4 + pxor %xmm0,%xmm1 + pxor %xmm15,%xmm2 + pxor %xmm1,%xmm5 + pxor %xmm15,%xmm4 + 
+ pxor %xmm2,%xmm5 + pxor %xmm6,%xmm2 + pxor %xmm4,%xmm6 + pxor %xmm3,%xmm2 + pxor %xmm4,%xmm3 + pxor %xmm0,%xmm2 + + pxor %xmm6,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm6,%xmm10 + movdqa %xmm0,%xmm9 + movdqa %xmm4,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm11 + + pxor %xmm3,%xmm10 + pxor %xmm1,%xmm9 + pxor %xmm2,%xmm8 + movdqa %xmm10,%xmm13 + pxor %xmm3,%xmm12 + movdqa %xmm9,%xmm7 + pxor %xmm15,%xmm11 + movdqa %xmm10,%xmm14 + + por %xmm8,%xmm9 + por %xmm11,%xmm10 + pxor %xmm7,%xmm14 + pand %xmm11,%xmm13 + pxor %xmm8,%xmm11 + pand %xmm8,%xmm7 + pand %xmm11,%xmm14 + movdqa %xmm2,%xmm11 + pxor %xmm15,%xmm11 + pand %xmm11,%xmm12 + pxor %xmm12,%xmm10 + pxor %xmm12,%xmm9 + movdqa %xmm6,%xmm12 + movdqa %xmm4,%xmm11 + pxor %xmm0,%xmm12 + pxor %xmm5,%xmm11 + movdqa %xmm12,%xmm8 + pand %xmm11,%xmm12 + por %xmm11,%xmm8 + pxor %xmm12,%xmm7 + pxor %xmm14,%xmm10 + pxor %xmm13,%xmm9 + pxor %xmm14,%xmm8 + movdqa %xmm1,%xmm11 + pxor %xmm13,%xmm7 + movdqa %xmm3,%xmm12 + pxor %xmm13,%xmm8 + movdqa %xmm0,%xmm13 + pand %xmm2,%xmm11 + movdqa %xmm6,%xmm14 + pand %xmm15,%xmm12 + pand %xmm4,%xmm13 + por %xmm5,%xmm14 + pxor %xmm11,%xmm10 + pxor %xmm12,%xmm9 + pxor %xmm13,%xmm8 + pxor %xmm14,%xmm7 + + + + + + movdqa %xmm10,%xmm11 + pand %xmm8,%xmm10 + pxor %xmm9,%xmm11 + + movdqa %xmm7,%xmm13 + movdqa %xmm11,%xmm14 + pxor %xmm10,%xmm13 + pand %xmm13,%xmm14 + + movdqa %xmm8,%xmm12 + pxor %xmm9,%xmm14 + pxor %xmm7,%xmm12 + + pxor %xmm9,%xmm10 + + pand %xmm10,%xmm12 + + movdqa %xmm13,%xmm9 + pxor %xmm7,%xmm12 + + pxor %xmm12,%xmm9 + pxor %xmm12,%xmm8 + + pand %xmm7,%xmm9 + + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm8 + + pand %xmm14,%xmm13 + + pxor %xmm11,%xmm13 + movdqa %xmm5,%xmm11 + movdqa %xmm4,%xmm7 + movdqa %xmm14,%xmm9 + pxor %xmm13,%xmm9 + pand %xmm5,%xmm9 + pxor %xmm4,%xmm5 + pand %xmm14,%xmm4 + pand %xmm13,%xmm5 + pxor %xmm4,%xmm5 + pxor %xmm9,%xmm4 + pxor %xmm15,%xmm11 + pxor %xmm2,%xmm7 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm15,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm2,%xmm15 + pand %xmm14,%xmm7 + pand %xmm12,%xmm2 + pand %xmm13,%xmm11 + pand %xmm8,%xmm15 + pxor %xmm11,%xmm7 + pxor %xmm2,%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm11,%xmm5 + pxor %xmm11,%xmm15 + pxor %xmm7,%xmm4 + pxor %xmm7,%xmm2 + + movdqa %xmm6,%xmm11 + movdqa %xmm0,%xmm7 + pxor %xmm3,%xmm11 + pxor %xmm1,%xmm7 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm3,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm1,%xmm3 + pand %xmm14,%xmm7 + pand %xmm12,%xmm1 + pand %xmm13,%xmm11 + pand %xmm8,%xmm3 + pxor %xmm11,%xmm7 + pxor %xmm1,%xmm3 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm1 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + pxor %xmm13,%xmm10 + pand %xmm6,%xmm10 + pxor %xmm0,%xmm6 + pand %xmm14,%xmm0 + pand %xmm13,%xmm6 + pxor %xmm0,%xmm6 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm6 + pxor %xmm11,%xmm3 + pxor %xmm7,%xmm0 + pxor %xmm7,%xmm1 + pxor %xmm15,%xmm6 + pxor %xmm5,%xmm0 + pxor %xmm6,%xmm3 + pxor %xmm15,%xmm5 + pxor %xmm0,%xmm15 + + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm4 + pxor %xmm2,%xmm1 + pxor %xmm4,%xmm2 + pxor %xmm4,%xmm3 + + pxor %xmm2,%xmm5 + decl %r10d + jl .Lenc_done + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 + pxor %xmm7,%xmm15 + pshufd $0x93,%xmm3,%xmm9 + pxor %xmm8,%xmm0 + pshufd $0x93,%xmm5,%xmm10 + pxor %xmm9,%xmm3 + pshufd $0x93,%xmm2,%xmm11 + pxor %xmm10,%xmm5 + pshufd $0x93,%xmm6,%xmm12 + pxor %xmm11,%xmm2 + pshufd 
$0x93,%xmm1,%xmm13 + pxor %xmm12,%xmm6 + pshufd $0x93,%xmm4,%xmm14 + pxor %xmm13,%xmm1 + pxor %xmm14,%xmm4 + + pxor %xmm15,%xmm8 + pxor %xmm4,%xmm7 + pxor %xmm4,%xmm8 + pshufd $0x4E,%xmm15,%xmm15 + pxor %xmm0,%xmm9 + pshufd $0x4E,%xmm0,%xmm0 + pxor %xmm2,%xmm12 + pxor %xmm7,%xmm15 + pxor %xmm6,%xmm13 + pxor %xmm8,%xmm0 + pxor %xmm5,%xmm11 + pshufd $0x4E,%xmm2,%xmm7 + pxor %xmm1,%xmm14 + pshufd $0x4E,%xmm6,%xmm8 + pxor %xmm3,%xmm10 + pshufd $0x4E,%xmm5,%xmm2 + pxor %xmm4,%xmm10 + pshufd $0x4E,%xmm4,%xmm6 + pxor %xmm4,%xmm11 + pshufd $0x4E,%xmm1,%xmm5 + pxor %xmm11,%xmm7 + pshufd $0x4E,%xmm3,%xmm1 + pxor %xmm12,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm14,%xmm6 + pxor %xmm13,%xmm5 + movdqa %xmm7,%xmm3 + pxor %xmm9,%xmm1 + movdqa %xmm8,%xmm4 + movdqa 48(%r11),%xmm7 + jnz .Lenc_loop + movdqa 64(%r11),%xmm7 + jmp .Lenc_loop +.align 16 +.Lenc_done: + movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm1,%xmm9 + psrlq $1,%xmm1 + movdqa %xmm2,%xmm10 + psrlq $1,%xmm2 + pxor %xmm4,%xmm1 + pxor %xmm6,%xmm2 + pand %xmm7,%xmm1 + pand %xmm7,%xmm2 + pxor %xmm1,%xmm4 + psllq $1,%xmm1 + pxor %xmm2,%xmm6 + psllq $1,%xmm2 + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm2 + movdqa %xmm3,%xmm9 + psrlq $1,%xmm3 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm5,%xmm3 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm3 + pand %xmm7,%xmm15 + pxor %xmm3,%xmm5 + psllq $1,%xmm3 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm3 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm6,%xmm9 + psrlq $2,%xmm6 + movdqa %xmm2,%xmm10 + psrlq $2,%xmm2 + pxor %xmm4,%xmm6 + pxor %xmm1,%xmm2 + pand %xmm8,%xmm6 + pand %xmm8,%xmm2 + pxor %xmm6,%xmm4 + psllq $2,%xmm6 + pxor %xmm2,%xmm1 + psllq $2,%xmm2 + pxor %xmm9,%xmm6 + pxor %xmm10,%xmm2 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm5,%xmm0 + pxor %xmm3,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm5 + psllq $2,%xmm0 + pxor %xmm15,%xmm3 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm5,%xmm9 + psrlq $4,%xmm5 + movdqa %xmm3,%xmm10 + psrlq $4,%xmm3 + pxor %xmm4,%xmm5 + pxor %xmm1,%xmm3 + pand %xmm7,%xmm5 + pand %xmm7,%xmm3 + pxor %xmm5,%xmm4 + psllq $4,%xmm5 + pxor %xmm3,%xmm1 + psllq $4,%xmm3 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm3 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm6,%xmm0 + pxor %xmm2,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm6 + psllq $4,%xmm0 + pxor %xmm15,%xmm2 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa (%rax),%xmm7 + pxor %xmm7,%xmm3 + pxor %xmm7,%xmm5 + pxor %xmm7,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm7,%xmm1 + pxor %xmm7,%xmm4 + pxor %xmm7,%xmm15 + pxor %xmm7,%xmm0 + .byte 0xf3,0xc3 +.cfi_endproc +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_decrypt8,@function +.align 64 +_bsaes_decrypt8: +.cfi_startproc + leaq .LBS0(%rip),%r11 + + movdqa (%rax),%xmm8 + leaq 16(%rax),%rax + movdqa -48(%r11),%xmm7 + pxor %xmm8,%xmm15 + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm5,%xmm9 + psrlq $1,%xmm5 + movdqa %xmm3,%xmm10 + psrlq $1,%xmm3 + pxor %xmm6,%xmm5 + pxor %xmm4,%xmm3 + pand %xmm7,%xmm5 + pand %xmm7,%xmm3 + pxor %xmm5,%xmm6 + psllq $1,%xmm5 + pxor %xmm3,%xmm4 + psllq 
$1,%xmm3 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm3 + movdqa %xmm1,%xmm9 + psrlq $1,%xmm1 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm1 + pand %xmm7,%xmm15 + pxor %xmm1,%xmm2 + psllq $1,%xmm1 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm4,%xmm9 + psrlq $2,%xmm4 + movdqa %xmm3,%xmm10 + psrlq $2,%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pand %xmm8,%xmm4 + pand %xmm8,%xmm3 + pxor %xmm4,%xmm6 + psllq $2,%xmm4 + pxor %xmm3,%xmm5 + psllq $2,%xmm3 + pxor %xmm9,%xmm4 + pxor %xmm10,%xmm3 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm2,%xmm0 + pxor %xmm1,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm2 + psllq $2,%xmm0 + pxor %xmm15,%xmm1 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm2,%xmm9 + psrlq $4,%xmm2 + movdqa %xmm1,%xmm10 + psrlq $4,%xmm1 + pxor %xmm6,%xmm2 + pxor %xmm5,%xmm1 + pand %xmm7,%xmm2 + pand %xmm7,%xmm1 + pxor %xmm2,%xmm6 + psllq $4,%xmm2 + pxor %xmm1,%xmm5 + psllq $4,%xmm1 + pxor %xmm9,%xmm2 + pxor %xmm10,%xmm1 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm4 + psllq $4,%xmm0 + pxor %xmm15,%xmm3 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + decl %r10d + jmp .Ldec_sbox +.align 16 +.Ldec_loop: + pxor 0(%rax),%xmm15 + pxor 16(%rax),%xmm0 + pxor 32(%rax),%xmm1 + pxor 48(%rax),%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor 64(%rax),%xmm3 + pxor 80(%rax),%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor 96(%rax),%xmm5 + pxor 112(%rax),%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + leaq 128(%rax),%rax +.Ldec_sbox: + pxor %xmm3,%xmm2 + + pxor %xmm6,%xmm3 + pxor %xmm6,%xmm1 + pxor %xmm3,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm0 + + pxor %xmm0,%xmm15 + pxor %xmm4,%xmm1 + pxor %xmm15,%xmm2 + pxor %xmm15,%xmm4 + pxor %xmm2,%xmm0 + movdqa %xmm2,%xmm10 + movdqa %xmm6,%xmm9 + movdqa %xmm0,%xmm8 + movdqa %xmm3,%xmm12 + movdqa %xmm4,%xmm11 + + pxor %xmm15,%xmm10 + pxor %xmm3,%xmm9 + pxor %xmm5,%xmm8 + movdqa %xmm10,%xmm13 + pxor %xmm15,%xmm12 + movdqa %xmm9,%xmm7 + pxor %xmm1,%xmm11 + movdqa %xmm10,%xmm14 + + por %xmm8,%xmm9 + por %xmm11,%xmm10 + pxor %xmm7,%xmm14 + pand %xmm11,%xmm13 + pxor %xmm8,%xmm11 + pand %xmm8,%xmm7 + pand %xmm11,%xmm14 + movdqa %xmm5,%xmm11 + pxor %xmm1,%xmm11 + pand %xmm11,%xmm12 + pxor %xmm12,%xmm10 + pxor %xmm12,%xmm9 + movdqa %xmm2,%xmm12 + movdqa %xmm0,%xmm11 + pxor %xmm6,%xmm12 + pxor %xmm4,%xmm11 + movdqa %xmm12,%xmm8 + pand %xmm11,%xmm12 + por %xmm11,%xmm8 + pxor %xmm12,%xmm7 + pxor %xmm14,%xmm10 + pxor %xmm13,%xmm9 + pxor %xmm14,%xmm8 + movdqa %xmm3,%xmm11 + pxor %xmm13,%xmm7 + movdqa %xmm15,%xmm12 + pxor %xmm13,%xmm8 + movdqa %xmm6,%xmm13 + pand %xmm5,%xmm11 + movdqa %xmm2,%xmm14 + pand %xmm1,%xmm12 + pand %xmm0,%xmm13 + por %xmm4,%xmm14 + pxor %xmm11,%xmm10 + pxor %xmm12,%xmm9 + pxor %xmm13,%xmm8 + pxor %xmm14,%xmm7 + + + + + + movdqa %xmm10,%xmm11 + pand %xmm8,%xmm10 + pxor %xmm9,%xmm11 + + movdqa %xmm7,%xmm13 + movdqa %xmm11,%xmm14 + pxor %xmm10,%xmm13 + pand %xmm13,%xmm14 + + movdqa %xmm8,%xmm12 + pxor %xmm9,%xmm14 + pxor %xmm7,%xmm12 + + pxor %xmm9,%xmm10 + + pand %xmm10,%xmm12 + + movdqa %xmm13,%xmm9 + pxor %xmm7,%xmm12 + + pxor %xmm12,%xmm9 + pxor %xmm12,%xmm8 + + pand %xmm7,%xmm9 + + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm8 + + pand 
%xmm14,%xmm13 + + pxor %xmm11,%xmm13 + movdqa %xmm4,%xmm11 + movdqa %xmm0,%xmm7 + movdqa %xmm14,%xmm9 + pxor %xmm13,%xmm9 + pand %xmm4,%xmm9 + pxor %xmm0,%xmm4 + pand %xmm14,%xmm0 + pand %xmm13,%xmm4 + pxor %xmm0,%xmm4 + pxor %xmm9,%xmm0 + pxor %xmm1,%xmm11 + pxor %xmm5,%xmm7 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm1,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm5,%xmm1 + pand %xmm14,%xmm7 + pand %xmm12,%xmm5 + pand %xmm13,%xmm11 + pand %xmm8,%xmm1 + pxor %xmm11,%xmm7 + pxor %xmm5,%xmm1 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm5 + pxor %xmm11,%xmm4 + pxor %xmm11,%xmm1 + pxor %xmm7,%xmm0 + pxor %xmm7,%xmm5 + + movdqa %xmm2,%xmm11 + movdqa %xmm6,%xmm7 + pxor %xmm15,%xmm11 + pxor %xmm3,%xmm7 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm15,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm3,%xmm15 + pand %xmm14,%xmm7 + pand %xmm12,%xmm3 + pand %xmm13,%xmm11 + pand %xmm8,%xmm15 + pxor %xmm11,%xmm7 + pxor %xmm3,%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm3 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + pxor %xmm13,%xmm10 + pand %xmm2,%xmm10 + pxor %xmm6,%xmm2 + pand %xmm14,%xmm6 + pand %xmm13,%xmm2 + pxor %xmm6,%xmm2 + pxor %xmm10,%xmm6 + pxor %xmm11,%xmm2 + pxor %xmm11,%xmm15 + pxor %xmm7,%xmm6 + pxor %xmm7,%xmm3 + pxor %xmm6,%xmm0 + pxor %xmm4,%xmm5 + + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm1 + pxor %xmm6,%xmm4 + pxor %xmm1,%xmm3 + pxor %xmm15,%xmm6 + pxor %xmm4,%xmm3 + pxor %xmm5,%xmm2 + pxor %xmm0,%xmm5 + pxor %xmm3,%xmm2 + + pxor %xmm15,%xmm3 + pxor %xmm2,%xmm6 + decl %r10d + jl .Ldec_done + + pshufd $0x4E,%xmm15,%xmm7 + pshufd $0x4E,%xmm2,%xmm13 + pxor %xmm15,%xmm7 + pshufd $0x4E,%xmm4,%xmm14 + pxor %xmm2,%xmm13 + pshufd $0x4E,%xmm0,%xmm8 + pxor %xmm4,%xmm14 + pshufd $0x4E,%xmm5,%xmm9 + pxor %xmm0,%xmm8 + pshufd $0x4E,%xmm3,%xmm10 + pxor %xmm5,%xmm9 + pxor %xmm13,%xmm15 + pxor %xmm13,%xmm0 + pshufd $0x4E,%xmm1,%xmm11 + pxor %xmm3,%xmm10 + pxor %xmm7,%xmm5 + pxor %xmm8,%xmm3 + pshufd $0x4E,%xmm6,%xmm12 + pxor %xmm1,%xmm11 + pxor %xmm14,%xmm0 + pxor %xmm9,%xmm1 + pxor %xmm6,%xmm12 + + pxor %xmm14,%xmm5 + pxor %xmm13,%xmm3 + pxor %xmm13,%xmm1 + pxor %xmm10,%xmm6 + pxor %xmm11,%xmm2 + pxor %xmm14,%xmm1 + pxor %xmm14,%xmm6 + pxor %xmm12,%xmm4 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 + pxor %xmm7,%xmm15 + pshufd $0x93,%xmm5,%xmm9 + pxor %xmm8,%xmm0 + pshufd $0x93,%xmm3,%xmm10 + pxor %xmm9,%xmm5 + pshufd $0x93,%xmm1,%xmm11 + pxor %xmm10,%xmm3 + pshufd $0x93,%xmm6,%xmm12 + pxor %xmm11,%xmm1 + pshufd $0x93,%xmm2,%xmm13 + pxor %xmm12,%xmm6 + pshufd $0x93,%xmm4,%xmm14 + pxor %xmm13,%xmm2 + pxor %xmm14,%xmm4 + + pxor %xmm15,%xmm8 + pxor %xmm4,%xmm7 + pxor %xmm4,%xmm8 + pshufd $0x4E,%xmm15,%xmm15 + pxor %xmm0,%xmm9 + pshufd $0x4E,%xmm0,%xmm0 + pxor %xmm1,%xmm12 + pxor %xmm7,%xmm15 + pxor %xmm6,%xmm13 + pxor %xmm8,%xmm0 + pxor %xmm3,%xmm11 + pshufd $0x4E,%xmm1,%xmm7 + pxor %xmm2,%xmm14 + pshufd $0x4E,%xmm6,%xmm8 + pxor %xmm5,%xmm10 + pshufd $0x4E,%xmm3,%xmm1 + pxor %xmm4,%xmm10 + pshufd $0x4E,%xmm4,%xmm6 + pxor %xmm4,%xmm11 + pshufd $0x4E,%xmm2,%xmm3 + pxor %xmm11,%xmm7 + pshufd $0x4E,%xmm5,%xmm2 + pxor %xmm12,%xmm8 + pxor %xmm1,%xmm10 + pxor %xmm14,%xmm6 + pxor %xmm3,%xmm13 + movdqa %xmm7,%xmm3 + pxor %xmm9,%xmm2 + movdqa %xmm13,%xmm5 + movdqa %xmm8,%xmm4 + movdqa %xmm2,%xmm1 + movdqa %xmm10,%xmm2 + movdqa -16(%r11),%xmm7 + jnz .Ldec_loop + movdqa -32(%r11),%xmm7 + jmp .Ldec_loop +.align 16 +.Ldec_done: + 
movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm2,%xmm9 + psrlq $1,%xmm2 + movdqa %xmm1,%xmm10 + psrlq $1,%xmm1 + pxor %xmm4,%xmm2 + pxor %xmm6,%xmm1 + pand %xmm7,%xmm2 + pand %xmm7,%xmm1 + pxor %xmm2,%xmm4 + psllq $1,%xmm2 + pxor %xmm1,%xmm6 + psllq $1,%xmm1 + pxor %xmm9,%xmm2 + pxor %xmm10,%xmm1 + movdqa %xmm5,%xmm9 + psrlq $1,%xmm5 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm3,%xmm5 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm5 + pand %xmm7,%xmm15 + pxor %xmm5,%xmm3 + psllq $1,%xmm5 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm6,%xmm9 + psrlq $2,%xmm6 + movdqa %xmm1,%xmm10 + psrlq $2,%xmm1 + pxor %xmm4,%xmm6 + pxor %xmm2,%xmm1 + pand %xmm8,%xmm6 + pand %xmm8,%xmm1 + pxor %xmm6,%xmm4 + psllq $2,%xmm6 + pxor %xmm1,%xmm2 + psllq $2,%xmm1 + pxor %xmm9,%xmm6 + pxor %xmm10,%xmm1 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm3 + psllq $2,%xmm0 + pxor %xmm15,%xmm5 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm3,%xmm9 + psrlq $4,%xmm3 + movdqa %xmm5,%xmm10 + psrlq $4,%xmm5 + pxor %xmm4,%xmm3 + pxor %xmm2,%xmm5 + pand %xmm7,%xmm3 + pand %xmm7,%xmm5 + pxor %xmm3,%xmm4 + psllq $4,%xmm3 + pxor %xmm5,%xmm2 + psllq $4,%xmm5 + pxor %xmm9,%xmm3 + pxor %xmm10,%xmm5 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm6,%xmm0 + pxor %xmm1,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm6 + psllq $4,%xmm0 + pxor %xmm15,%xmm1 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa (%rax),%xmm7 + pxor %xmm7,%xmm5 + pxor %xmm7,%xmm3 + pxor %xmm7,%xmm1 + pxor %xmm7,%xmm6 + pxor %xmm7,%xmm2 + pxor %xmm7,%xmm4 + pxor %xmm7,%xmm15 + pxor %xmm7,%xmm0 + .byte 0xf3,0xc3 +.cfi_endproc +.size _bsaes_decrypt8,.-_bsaes_decrypt8 +.type _bsaes_key_convert,@function +.align 16 +_bsaes_key_convert: +.cfi_startproc + leaq .Lmasks(%rip),%r11 + movdqu (%rcx),%xmm7 + leaq 16(%rcx),%rcx + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + movdqa 48(%r11),%xmm3 + movdqa 64(%r11),%xmm4 + pcmpeqd %xmm5,%xmm5 + + movdqu (%rcx),%xmm6 + movdqa %xmm7,(%rax) + leaq 16(%rax),%rax + decl %r10d + jmp .Lkey_loop +.align 16 +.Lkey_loop: +.byte 102,15,56,0,244 + + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + + pand %xmm6,%xmm8 + pand %xmm6,%xmm9 + movdqa %xmm2,%xmm10 + pcmpeqb %xmm0,%xmm8 + psllq $4,%xmm0 + movdqa %xmm3,%xmm11 + pcmpeqb %xmm1,%xmm9 + psllq $4,%xmm1 + + pand %xmm6,%xmm10 + pand %xmm6,%xmm11 + movdqa %xmm0,%xmm12 + pcmpeqb %xmm2,%xmm10 + psllq $4,%xmm2 + movdqa %xmm1,%xmm13 + pcmpeqb %xmm3,%xmm11 + psllq $4,%xmm3 + + movdqa %xmm2,%xmm14 + movdqa %xmm3,%xmm15 + pxor %xmm5,%xmm8 + pxor %xmm5,%xmm9 + + pand %xmm6,%xmm12 + pand %xmm6,%xmm13 + movdqa %xmm8,0(%rax) + pcmpeqb %xmm0,%xmm12 + psrlq $4,%xmm0 + movdqa %xmm9,16(%rax) + pcmpeqb %xmm1,%xmm13 + psrlq $4,%xmm1 + leaq 16(%rcx),%rcx + + pand %xmm6,%xmm14 + pand %xmm6,%xmm15 + movdqa %xmm10,32(%rax) + pcmpeqb %xmm2,%xmm14 + psrlq $4,%xmm2 + movdqa %xmm11,48(%rax) + pcmpeqb %xmm3,%xmm15 + psrlq $4,%xmm3 + movdqu (%rcx),%xmm6 + + pxor %xmm5,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm12,64(%rax) + movdqa %xmm13,80(%rax) + movdqa %xmm14,96(%rax) + movdqa %xmm15,112(%rax) + leaq 128(%rax),%rax + decl %r10d + jnz .Lkey_loop + + movdqa 80(%r11),%xmm7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size _bsaes_key_convert,.-_bsaes_key_convert + +.globl ossl_bsaes_cbc_encrypt +.type 
ossl_bsaes_cbc_encrypt,@function +.align 16 +ossl_bsaes_cbc_encrypt: +.cfi_startproc +.byte 243,15,30,250 + cmpl $0,%r9d + jne asm_AES_cbc_encrypt + cmpq $128,%rdx + jb asm_AES_cbc_encrypt + + movq %rsp,%rax +.Lcbc_dec_prologue: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + movl 240(%rcx),%eax + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + movq %r8,%rbx + shrq $4,%r14 + + movl %eax,%edx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %edx,%r10d + call _bsaes_key_convert + pxor (%rsp),%xmm7 + movdqa %xmm6,(%rax) + movdqa %xmm7,(%rsp) + + movdqu (%rbx),%xmm14 + subq $8,%r14 +.Lcbc_dec_loop: + movdqu 0(%r12),%xmm15 + movdqu 16(%r12),%xmm0 + movdqu 32(%r12),%xmm1 + movdqu 48(%r12),%xmm2 + movdqu 64(%r12),%xmm3 + movdqu 80(%r12),%xmm4 + movq %rsp,%rax + movdqu 96(%r12),%xmm5 + movl %edx,%r10d + movdqu 112(%r12),%xmm6 + movdqa %xmm14,32(%rbp) + + call _bsaes_decrypt8 + + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm1 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm6 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm2 + movdqu 112(%r12),%xmm14 + pxor %xmm13,%xmm4 + movdqu %xmm15,0(%r13) + leaq 128(%r12),%r12 + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + movdqu %xmm2,96(%r13) + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + subq $8,%r14 + jnc .Lcbc_dec_loop + + addq $8,%r14 + jz .Lcbc_dec_done + + movdqu 0(%r12),%xmm15 + movq %rsp,%rax + movl %edx,%r10d + cmpq $2,%r14 + jb .Lcbc_dec_one + movdqu 16(%r12),%xmm0 + je .Lcbc_dec_two + movdqu 32(%r12),%xmm1 + cmpq $4,%r14 + jb .Lcbc_dec_three + movdqu 48(%r12),%xmm2 + je .Lcbc_dec_four + movdqu 64(%r12),%xmm3 + cmpq $6,%r14 + jb .Lcbc_dec_five + movdqu 80(%r12),%xmm4 + je .Lcbc_dec_six + movdqu 96(%r12),%xmm5 + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm1 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm6 + movdqu 96(%r12),%xmm14 + pxor %xmm12,%xmm2 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + movdqu %xmm2,96(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_six: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm1 + movdqu 80(%r12),%xmm14 + pxor %xmm11,%xmm6 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_five: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + 
movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm14 + pxor %xmm10,%xmm1 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_four: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm14 + pxor %xmm9,%xmm3 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_three: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm14 + pxor %xmm8,%xmm5 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_two: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm14 + pxor %xmm7,%xmm0 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_one: + leaq (%r12),%rdi + leaq 32(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm14 + movdqu %xmm14,(%r13) + movdqa %xmm15,%xmm14 + +.Lcbc_dec_done: + movdqu %xmm14,(%rbx) + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lcbc_dec_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lcbc_dec_bzero + + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lcbc_dec_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt + +.globl ossl_bsaes_ctr32_encrypt_blocks +.type ossl_bsaes_ctr32_encrypt_blocks,@function +.align 16 +ossl_bsaes_ctr32_encrypt_blocks: +.cfi_startproc +.byte 243,15,30,250 + movq %rsp,%rax +.Lctr_enc_prologue: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + movdqu (%r8),%xmm0 + movl 240(%rcx),%eax + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + movdqa %xmm0,32(%rbp) + cmpq $8,%rdx + jb .Lctr_enc_short + + movl %eax,%ebx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %ebx,%r10d + call _bsaes_key_convert + pxor %xmm6,%xmm7 + movdqa %xmm7,(%rax) + + movdqa (%rsp),%xmm8 + leaq .LADD1(%rip),%r11 + movdqa 32(%rbp),%xmm15 + movdqa -32(%r11),%xmm7 +.byte 102,68,15,56,0,199 +.byte 102,68,15,56,0,255 + movdqa %xmm8,(%rsp) + jmp .Lctr_enc_loop +.align 16 +.Lctr_enc_loop: + movdqa %xmm15,32(%rbp) + movdqa %xmm15,%xmm0 + movdqa %xmm15,%xmm1 + paddd 0(%r11),%xmm0 + movdqa %xmm15,%xmm2 + paddd 16(%r11),%xmm1 + movdqa %xmm15,%xmm3 + paddd 32(%r11),%xmm2 + movdqa %xmm15,%xmm4 + paddd 48(%r11),%xmm3 + 
movdqa %xmm15,%xmm5 + paddd 64(%r11),%xmm4 + movdqa %xmm15,%xmm6 + paddd 80(%r11),%xmm5 + paddd 96(%r11),%xmm6 + + + + movdqa (%rsp),%xmm8 + leaq 16(%rsp),%rax + movdqa -16(%r11),%xmm7 + pxor %xmm8,%xmm15 + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + leaq .LBS0(%rip),%r11 + movl %ebx,%r10d + + call _bsaes_encrypt8_bitslice + + subq $8,%r14 + jc .Lctr_enc_loop_done + + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + movdqu 32(%r12),%xmm9 + movdqu 48(%r12),%xmm10 + movdqu 64(%r12),%xmm11 + movdqu 80(%r12),%xmm12 + movdqu 96(%r12),%xmm13 + movdqu 112(%r12),%xmm14 + leaq 128(%r12),%r12 + pxor %xmm15,%xmm7 + movdqa 32(%rbp),%xmm15 + pxor %xmm8,%xmm0 + movdqu %xmm7,0(%r13) + pxor %xmm9,%xmm3 + movdqu %xmm0,16(%r13) + pxor %xmm10,%xmm5 + movdqu %xmm3,32(%r13) + pxor %xmm11,%xmm2 + movdqu %xmm5,48(%r13) + pxor %xmm12,%xmm6 + movdqu %xmm2,64(%r13) + pxor %xmm13,%xmm1 + movdqu %xmm6,80(%r13) + pxor %xmm14,%xmm4 + movdqu %xmm1,96(%r13) + leaq .LADD1(%rip),%r11 + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + paddd 112(%r11),%xmm15 + jnz .Lctr_enc_loop + + jmp .Lctr_enc_done +.align 16 +.Lctr_enc_loop_done: + addq $8,%r14 + movdqu 0(%r12),%xmm7 + pxor %xmm7,%xmm15 + movdqu %xmm15,0(%r13) + cmpq $2,%r14 + jb .Lctr_enc_done + movdqu 16(%r12),%xmm8 + pxor %xmm8,%xmm0 + movdqu %xmm0,16(%r13) + je .Lctr_enc_done + movdqu 32(%r12),%xmm9 + pxor %xmm9,%xmm3 + movdqu %xmm3,32(%r13) + cmpq $4,%r14 + jb .Lctr_enc_done + movdqu 48(%r12),%xmm10 + pxor %xmm10,%xmm5 + movdqu %xmm5,48(%r13) + je .Lctr_enc_done + movdqu 64(%r12),%xmm11 + pxor %xmm11,%xmm2 + movdqu %xmm2,64(%r13) + cmpq $6,%r14 + jb .Lctr_enc_done + movdqu 80(%r12),%xmm12 + pxor %xmm12,%xmm6 + movdqu %xmm6,80(%r13) + je .Lctr_enc_done + movdqu 96(%r12),%xmm13 + pxor %xmm13,%xmm1 + movdqu %xmm1,96(%r13) + jmp .Lctr_enc_done + +.align 16 +.Lctr_enc_short: + leaq 32(%rbp),%rdi + leaq 48(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_encrypt + movdqu (%r12),%xmm0 + leaq 16(%r12),%r12 + movl 44(%rbp),%eax + bswapl %eax + pxor 48(%rbp),%xmm0 + incl %eax + movdqu %xmm0,(%r13) + bswapl %eax + leaq 16(%r13),%r13 + movl %eax,44(%rsp) + decq %r14 + jnz .Lctr_enc_short + +.Lctr_enc_done: + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lctr_enc_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lctr_enc_bzero + + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lctr_enc_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks +.globl ossl_bsaes_xts_encrypt +.type ossl_bsaes_xts_encrypt,@function +.align 16 +ossl_bsaes_xts_encrypt: +.cfi_startproc + movq %rsp,%rax +.Lxts_enc_prologue: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r15,-56 + leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + + leaq (%r9),%rdi + leaq 32(%rbp),%rsi + leaq (%r8),%rdx + call asm_AES_encrypt + + movl 240(%r15),%eax + movq %r14,%rbx + + movl %eax,%edx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %edx,%r10d + call _bsaes_key_convert + pxor %xmm6,%xmm7 + movdqa %xmm7,(%rax) + + andq $-16,%r14 + subq $0x80,%rsp + movdqa 32(%rbp),%xmm6 + + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + + subq $0x80,%r14 + jc .Lxts_enc_short + jmp .Lxts_enc_loop + +.align 16 +.Lxts_enc_loop: + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 0(%r12),%xmm7 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm15 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm0 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm1 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm2 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqu 112(%r12),%xmm14 + leaq 128(%r12),%r12 + movdqa %xmm6,112(%rsp) + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + pxor %xmm14,%xmm6 + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm2,64(%r13) + pxor 96(%rsp),%xmm1 + movdqu %xmm6,80(%r13) + pxor 112(%rsp),%xmm4 + movdqu %xmm1,96(%r13) + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + + subq $0x80,%r14 + jnc .Lxts_enc_loop + +.Lxts_enc_short: + addq $0x80,%r14 + jz .Lxts_enc_done + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 
0(%r12),%xmm7 + cmpq $16,%r14 + je .Lxts_enc_1 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + cmpq $32,%r14 + je .Lxts_enc_2 + pxor %xmm7,%xmm15 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + cmpq $48,%r14 + je .Lxts_enc_3 + pxor %xmm8,%xmm0 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + cmpq $64,%r14 + je .Lxts_enc_4 + pxor %xmm9,%xmm1 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + cmpq $80,%r14 + je .Lxts_enc_5 + pxor %xmm10,%xmm2 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + cmpq $96,%r14 + je .Lxts_enc_6 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqa %xmm6,112(%rsp) + leaq 112(%r12),%r12 + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm2,64(%r13) + pxor 96(%rsp),%xmm1 + movdqu %xmm6,80(%r13) + movdqu %xmm1,96(%r13) + leaq 112(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_6: + pxor %xmm11,%xmm3 + leaq 96(%r12),%r12 + pxor %xmm12,%xmm4 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm2,64(%r13) + movdqu %xmm6,80(%r13) + leaq 96(%r13),%r13 + + movdqa 96(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_5: + pxor %xmm10,%xmm2 + leaq 80(%r12),%r12 + pxor %xmm11,%xmm3 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + movdqu %xmm2,64(%r13) + leaq 80(%r13),%r13 + + movdqa 80(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_4: + pxor %xmm9,%xmm1 + leaq 64(%r12),%r12 + pxor %xmm10,%xmm2 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + movdqu %xmm5,48(%r13) + leaq 64(%r13),%r13 + + movdqa 64(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_3: + pxor %xmm8,%xmm0 + leaq 48(%r12),%r12 + pxor %xmm9,%xmm1 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + movdqu %xmm3,32(%r13) + leaq 
48(%r13),%r13 + + movdqa 48(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_2: + pxor %xmm7,%xmm15 + leaq 32(%r12),%r12 + pxor %xmm8,%xmm0 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + leaq 32(%r13),%r13 + + movdqa 32(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_1: + pxor %xmm15,%xmm7 + leaq 16(%r12),%r12 + movdqa %xmm7,32(%rbp) + leaq 32(%rbp),%rdi + leaq 32(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_encrypt + pxor 32(%rbp),%xmm15 + + + + + + movdqu %xmm15,0(%r13) + leaq 16(%r13),%r13 + + movdqa 16(%rsp),%xmm6 + +.Lxts_enc_done: + andl $15,%ebx + jz .Lxts_enc_ret + movq %r13,%rdx + +.Lxts_enc_steal: + movzbl (%r12),%eax + movzbl -16(%rdx),%ecx + leaq 1(%r12),%r12 + movb %al,-16(%rdx) + movb %cl,0(%rdx) + leaq 1(%rdx),%rdx + subl $1,%ebx + jnz .Lxts_enc_steal + + movdqu -16(%r13),%xmm15 + leaq 32(%rbp),%rdi + pxor %xmm6,%xmm15 + leaq 32(%rbp),%rsi + movdqa %xmm15,32(%rbp) + leaq (%r15),%rdx + call asm_AES_encrypt + pxor 32(%rbp),%xmm6 + movdqu %xmm6,-16(%r13) + +.Lxts_enc_ret: + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lxts_enc_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lxts_enc_bzero + + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lxts_enc_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt + +.globl ossl_bsaes_xts_decrypt +.type ossl_bsaes_xts_decrypt,@function +.align 16 +ossl_bsaes_xts_decrypt: +.cfi_startproc + movq %rsp,%rax +.Lxts_dec_prologue: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 + movq %rsp,%rbp + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + + leaq (%r9),%rdi + leaq 32(%rbp),%rsi + leaq (%r8),%rdx + call asm_AES_encrypt + + movl 240(%r15),%eax + movq %r14,%rbx + + movl %eax,%edx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %edx,%r10d + call _bsaes_key_convert + pxor (%rsp),%xmm7 + movdqa %xmm6,(%rax) + movdqa %xmm7,(%rsp) + + xorl %eax,%eax + andq $-16,%r14 + testl $15,%ebx + setnz %al + shlq $4,%rax + subq %rax,%r14 + + subq $0x80,%rsp + movdqa 32(%rbp),%xmm6 + + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + + subq $0x80,%r14 + jc .Lxts_dec_short + jmp .Lxts_dec_loop + +.align 16 +.Lxts_dec_loop: + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 0(%r12),%xmm7 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand 
%xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm15 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm0 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm1 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm2 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqu 112(%r12),%xmm14 + leaq 128(%r12),%r12 + movdqa %xmm6,112(%rsp) + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + pxor %xmm14,%xmm6 + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm1,64(%r13) + pxor 96(%rsp),%xmm2 + movdqu %xmm6,80(%r13) + pxor 112(%rsp),%xmm4 + movdqu %xmm2,96(%r13) + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + + subq $0x80,%r14 + jnc .Lxts_dec_loop + +.Lxts_dec_short: + addq $0x80,%r14 + jz .Lxts_dec_done + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 0(%r12),%xmm7 + cmpq $16,%r14 + je .Lxts_dec_1 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + cmpq $32,%r14 + je .Lxts_dec_2 + pxor %xmm7,%xmm15 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + cmpq $48,%r14 + je .Lxts_dec_3 + pxor %xmm8,%xmm0 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + cmpq $64,%r14 + je .Lxts_dec_4 + pxor %xmm9,%xmm1 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + cmpq $80,%r14 + je .Lxts_dec_5 + pxor %xmm10,%xmm2 + pshufd $0x13,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor 
%xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + cmpq $96,%r14 + je .Lxts_dec_6 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqa %xmm6,112(%rsp) + leaq 112(%r12),%r12 + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm1,64(%r13) + pxor 96(%rsp),%xmm2 + movdqu %xmm6,80(%r13) + movdqu %xmm2,96(%r13) + leaq 112(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_6: + pxor %xmm11,%xmm3 + leaq 96(%r12),%r12 + pxor %xmm12,%xmm4 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + leaq 96(%r13),%r13 + + movdqa 96(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_5: + pxor %xmm10,%xmm2 + leaq 80(%r12),%r12 + pxor %xmm11,%xmm3 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + leaq 80(%r13),%r13 + + movdqa 80(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_4: + pxor %xmm9,%xmm1 + leaq 64(%r12),%r12 + pxor %xmm10,%xmm2 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + leaq 64(%r13),%r13 + + movdqa 64(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_3: + pxor %xmm8,%xmm0 + leaq 48(%r12),%r12 + pxor %xmm9,%xmm1 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + leaq 48(%r13),%r13 + + movdqa 48(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_2: + pxor %xmm7,%xmm15 + leaq 32(%r12),%r12 + pxor %xmm8,%xmm0 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + leaq 32(%r13),%r13 + + movdqa 32(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_1: + pxor %xmm15,%xmm7 + leaq 16(%r12),%r12 + movdqa %xmm7,32(%rbp) + leaq 32(%rbp),%rdi + leaq 32(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm15 + + + + + + movdqu %xmm15,0(%r13) + leaq 16(%r13),%r13 + + movdqa 16(%rsp),%xmm6 + +.Lxts_dec_done: + andl $15,%ebx + jz .Lxts_dec_ret + + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + pshufd $0x13,%xmm14,%xmm13 + movdqa %xmm6,%xmm5 + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + movdqu (%r12),%xmm15 + pxor %xmm13,%xmm6 + + leaq 32(%rbp),%rdi + pxor %xmm6,%xmm15 + leaq 32(%rbp),%rsi + movdqa %xmm15,32(%rbp) + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm6 + movq %r13,%rdx + movdqu %xmm6,(%r13) + +.Lxts_dec_steal: + movzbl 16(%r12),%eax + movzbl (%rdx),%ecx + leaq 1(%r12),%r12 + movb %al,(%rdx) + movb %cl,16(%rdx) + leaq 
1(%rdx),%rdx + subl $1,%ebx + jnz .Lxts_dec_steal + + movdqu (%r13),%xmm15 + leaq 32(%rbp),%rdi + pxor %xmm5,%xmm15 + leaq 32(%rbp),%rsi + movdqa %xmm15,32(%rbp) + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm5 + movdqu %xmm5,(%r13) + +.Lxts_dec_ret: + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lxts_dec_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lxts_dec_bzero + + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lxts_dec_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt +.type _bsaes_const,@object +.align 64 +_bsaes_const: +.LM0ISR: +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LBS0: +.quad 0x5555555555555555, 0x5555555555555555 +.LBS1: +.quad 0x3333333333333333, 0x3333333333333333 +.LBS2: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0SR: +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSWPUP: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 +.LSWPUPM0SR: +.quad 0x0a0d02060c03070b, 0x0004080f05090e01 +.LADD1: +.quad 0x0000000000000000, 0x0000000100000000 +.LADD2: +.quad 0x0000000000000000, 0x0000000200000000 +.LADD3: +.quad 0x0000000000000000, 0x0000000300000000 +.LADD4: +.quad 0x0000000000000000, 0x0000000400000000 +.LADD5: +.quad 0x0000000000000000, 0x0000000500000000 +.LADD6: +.quad 0x0000000000000000, 0x0000000600000000 +.LADD7: +.quad 0x0000000000000000, 0x0000000700000000 +.LADD8: +.quad 0x0000000000000000, 0x0000000800000000 +.Lxts_magic: +.long 0x87,0,1,0 +.Lmasks: +.quad 0x0101010101010101, 0x0101010101010101 +.quad 0x0202020202020202, 0x0202020202020202 +.quad 0x0404040404040404, 0x0404040404040404 +.quad 0x0808080808080808, 0x0808080808080808 +.LM0: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.L63: +.quad 0x6363636363636363, 0x6363636363636363 +.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 +.align 64 +.size _bsaes_const,.-_bsaes_const + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/chacha-x86_64.S b/sys/crypto/openssl/amd64/chacha-x86_64.S --- a/sys/crypto/openssl/amd64/chacha-x86_64.S +++ b/sys/crypto/openssl/amd64/chacha-x86_64.S @@ -2192,3 +2192,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size ChaCha20_8x,.-ChaCha20_8x + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/cmll-x86_64.S b/sys/crypto/openssl/amd64/cmll-x86_64.S --- a/sys/crypto/openssl/amd64/cmll-x86_64.S +++ b/sys/crypto/openssl/amd64/cmll-x86_64.S @@ -1669,6 +1669,7 @@ .align 16 Camellia_cbc_encrypt: .cfi_startproc +.byte 243,15,30,250 cmpq $0,%rdx je .Lcbc_abort pushq %rbx @@ -1923,3 +1924,24 @@ .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt .byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54,95,54,52,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/e_padlock-x86_64.S b/sys/crypto/openssl/amd64/e_padlock-x86_64.S --- a/sys/crypto/openssl/amd64/e_padlock-x86_64.S +++ b/sys/crypto/openssl/amd64/e_padlock-x86_64.S @@ -1036,3 +1036,24 @@ .align 8 .Lpadlock_saved_context: .quad 0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S b/sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S --- a/sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S +++ b/sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S @@ -7342,3 +7342,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/ghash-x86_64.S b/sys/crypto/openssl/amd64/ghash-x86_64.S --- a/sys/crypto/openssl/amd64/ghash-x86_64.S +++ b/sys/crypto/openssl/amd64/ghash-x86_64.S @@ -7,6 +7,7 @@ .align 16 gcm_gmult_4bit: .cfi_startproc +.byte 243,15,30,250 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 @@ -118,6 +119,7 @@ .align 16 gcm_ghash_4bit: .cfi_startproc +.byte 243,15,30,250 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 @@ -865,6 +867,7 @@ .align 16 gcm_gmult_clmul: .cfi_startproc +.byte 243,15,30,250 .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 @@ -918,6 +921,7 @@ .align 32 gcm_ghash_clmul: .cfi_startproc +.byte 243,15,30,250 .L_ghash_clmul: movdqa .Lbswap_mask(%rip),%xmm10 @@ -1412,6 +1416,7 @@ .align 32 gcm_gmult_avx: .cfi_startproc +.byte 243,15,30,250 jmp .L_gmult_clmul .cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx @@ -1420,6 +1425,7 @@ .align 32 gcm_ghash_avx: .cfi_startproc +.byte 243,15,30,250 vzeroupper vmovdqu (%rdi),%xmm10 @@ -1846,3 +1852,24 @@ .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/keccak1600-x86_64.S b/sys/crypto/openssl/amd64/keccak1600-x86_64.S --- a/sys/crypto/openssl/amd64/keccak1600-x86_64.S +++ b/sys/crypto/openssl/amd64/keccak1600-x86_64.S @@ -523,3 +523,24 @@ .quad 0x8000000080008008 .size iotas,.-iotas .byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/md5-x86_64.S b/sys/crypto/openssl/amd64/md5-x86_64.S --- a/sys/crypto/openssl/amd64/md5-x86_64.S +++ b/sys/crypto/openssl/amd64/md5-x86_64.S @@ -2,9 +2,9 @@ .text .align 16 -.globl md5_block_asm_data_order -.type md5_block_asm_data_order,@function -md5_block_asm_data_order: +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order,@function +ossl_md5_block_asm_data_order: .cfi_startproc pushq %rbp .cfi_adjust_cfa_offset 8 @@ -681,4 +681,25 @@ .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc -.size md5_block_asm_data_order,.-md5_block_asm_data_order +.size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/poly1305-x86_64.S b/sys/crypto/openssl/amd64/poly1305-x86_64.S --- a/sys/crypto/openssl/amd64/poly1305-x86_64.S +++ b/sys/crypto/openssl/amd64/poly1305-x86_64.S @@ -2067,3 +2067,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/rc4-md5-x86_64.S b/sys/crypto/openssl/amd64/rc4-md5-x86_64.S --- a/sys/crypto/openssl/amd64/rc4-md5-x86_64.S +++ b/sys/crypto/openssl/amd64/rc4-md5-x86_64.S @@ -1280,3 +1280,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size rc4_md5_enc,.-rc4_md5_enc + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/rc4-x86_64.S b/sys/crypto/openssl/amd64/rc4-x86_64.S --- a/sys/crypto/openssl/amd64/rc4-x86_64.S +++ b/sys/crypto/openssl/amd64/rc4-x86_64.S @@ -7,6 +7,7 @@ .align 16 RC4: .cfi_startproc +.byte 243,15,30,250 orq %rsi,%rsi jne .Lentry .byte 0xf3,0xc3 @@ -536,6 +537,7 @@ .align 16 RC4_set_key: .cfi_startproc +.byte 243,15,30,250 leaq 8(%rdi),%rdi leaq (%rdx,%rsi,1),%rdx negq %rsi @@ -610,6 +612,7 @@ .align 16 RC4_options: .cfi_startproc +.byte 243,15,30,250 leaq .Lopts(%rip),%rax movl OPENSSL_ia32cap_P(%rip),%edx btl $20,%edx @@ -631,3 +634,24 @@ .byte 82,67,52,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 .size RC4_options,.-RC4_options + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/rsaz-avx2.S b/sys/crypto/openssl/amd64/rsaz-avx2.S --- a/sys/crypto/openssl/amd64/rsaz-avx2.S +++ b/sys/crypto/openssl/amd64/rsaz-avx2.S @@ -1743,3 +1743,24 @@ .long 2,2,2,2, 3,3,3,3 .long 4,4,4,4, 4,4,4,4 .align 64 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/rsaz-avx512.S b/sys/crypto/openssl/amd64/rsaz-avx512.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/amd64/rsaz-avx512.S @@ -0,0 +1,902 @@ +/* Do not modify. This file is auto-generated from rsaz-avx512.pl. 
*/ + +.globl ossl_rsaz_avx512ifma_eligible +.type ossl_rsaz_avx512ifma_eligible,@function +.align 32 +ossl_rsaz_avx512ifma_eligible: + movl OPENSSL_ia32cap_P+8(%rip),%ecx + xorl %eax,%eax + andl $2149777408,%ecx + cmpl $2149777408,%ecx + cmovel %ecx,%eax + .byte 0xf3,0xc3 +.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible +.text + +.globl ossl_rsaz_amm52x20_x1_256 +.type ossl_rsaz_amm52x20_x1_256,@function +.align 32 +ossl_rsaz_amm52x20_x1_256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lrsaz_amm52x20_x1_256_body: + + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm1 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + + movl $5,%ebx + +.align 32 +.Lloop5: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm3 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm4 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm3,%ymm1 + vpmadd52luq 32(%rsi),%ymm3,%ymm16 + vpmadd52luq 64(%rsi),%ymm3,%ymm17 + vpmadd52luq 96(%rsi),%ymm3,%ymm18 + vpmadd52luq 128(%rsi),%ymm3,%ymm19 + + vpmadd52luq 0(%rcx),%ymm4,%ymm1 + vpmadd52luq 32(%rcx),%ymm4,%ymm16 + vpmadd52luq 64(%rcx),%ymm4,%ymm17 + vpmadd52luq 96(%rcx),%ymm4,%ymm18 + vpmadd52luq 128(%rcx),%ymm4,%ymm19 + + + valignq $1,%ymm1,%ymm16,%ymm1 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm1,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm3,%ymm1 + vpmadd52huq 32(%rsi),%ymm3,%ymm16 + vpmadd52huq 64(%rsi),%ymm3,%ymm17 + vpmadd52huq 96(%rsi),%ymm3,%ymm18 + vpmadd52huq 128(%rsi),%ymm3,%ymm19 + + vpmadd52huq 0(%rcx),%ymm4,%ymm1 + vpmadd52huq 32(%rcx),%ymm4,%ymm16 + vpmadd52huq 64(%rcx),%ymm4,%ymm17 + vpmadd52huq 96(%rcx),%ymm4,%ymm18 + vpmadd52huq 128(%rcx),%ymm4,%ymm19 + movq 8(%r11),%r13 + + vpbroadcastq %r13,%ymm3 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm4 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm3,%ymm1 + vpmadd52luq 32(%rsi),%ymm3,%ymm16 + vpmadd52luq 64(%rsi),%ymm3,%ymm17 + vpmadd52luq 96(%rsi),%ymm3,%ymm18 + vpmadd52luq 128(%rsi),%ymm3,%ymm19 + + vpmadd52luq 0(%rcx),%ymm4,%ymm1 + vpmadd52luq 32(%rcx),%ymm4,%ymm16 + vpmadd52luq 64(%rcx),%ymm4,%ymm17 + vpmadd52luq 96(%rcx),%ymm4,%ymm18 + vpmadd52luq 128(%rcx),%ymm4,%ymm19 + + + valignq $1,%ymm1,%ymm16,%ymm1 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm1,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm3,%ymm1 + vpmadd52huq 32(%rsi),%ymm3,%ymm16 + vpmadd52huq 64(%rsi),%ymm3,%ymm17 + vpmadd52huq 96(%rsi),%ymm3,%ymm18 + vpmadd52huq 128(%rsi),%ymm3,%ymm19 + + vpmadd52huq 
0(%rcx),%ymm4,%ymm1 + vpmadd52huq 32(%rcx),%ymm4,%ymm16 + vpmadd52huq 64(%rcx),%ymm4,%ymm17 + vpmadd52huq 96(%rcx),%ymm4,%ymm18 + vpmadd52huq 128(%rcx),%ymm4,%ymm19 + movq 16(%r11),%r13 + + vpbroadcastq %r13,%ymm3 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm4 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm3,%ymm1 + vpmadd52luq 32(%rsi),%ymm3,%ymm16 + vpmadd52luq 64(%rsi),%ymm3,%ymm17 + vpmadd52luq 96(%rsi),%ymm3,%ymm18 + vpmadd52luq 128(%rsi),%ymm3,%ymm19 + + vpmadd52luq 0(%rcx),%ymm4,%ymm1 + vpmadd52luq 32(%rcx),%ymm4,%ymm16 + vpmadd52luq 64(%rcx),%ymm4,%ymm17 + vpmadd52luq 96(%rcx),%ymm4,%ymm18 + vpmadd52luq 128(%rcx),%ymm4,%ymm19 + + + valignq $1,%ymm1,%ymm16,%ymm1 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm1,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm3,%ymm1 + vpmadd52huq 32(%rsi),%ymm3,%ymm16 + vpmadd52huq 64(%rsi),%ymm3,%ymm17 + vpmadd52huq 96(%rsi),%ymm3,%ymm18 + vpmadd52huq 128(%rsi),%ymm3,%ymm19 + + vpmadd52huq 0(%rcx),%ymm4,%ymm1 + vpmadd52huq 32(%rcx),%ymm4,%ymm16 + vpmadd52huq 64(%rcx),%ymm4,%ymm17 + vpmadd52huq 96(%rcx),%ymm4,%ymm18 + vpmadd52huq 128(%rcx),%ymm4,%ymm19 + movq 24(%r11),%r13 + + vpbroadcastq %r13,%ymm3 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm4 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm3,%ymm1 + vpmadd52luq 32(%rsi),%ymm3,%ymm16 + vpmadd52luq 64(%rsi),%ymm3,%ymm17 + vpmadd52luq 96(%rsi),%ymm3,%ymm18 + vpmadd52luq 128(%rsi),%ymm3,%ymm19 + + vpmadd52luq 0(%rcx),%ymm4,%ymm1 + vpmadd52luq 32(%rcx),%ymm4,%ymm16 + vpmadd52luq 64(%rcx),%ymm4,%ymm17 + vpmadd52luq 96(%rcx),%ymm4,%ymm18 + vpmadd52luq 128(%rcx),%ymm4,%ymm19 + + + valignq $1,%ymm1,%ymm16,%ymm1 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm1,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm3,%ymm1 + vpmadd52huq 32(%rsi),%ymm3,%ymm16 + vpmadd52huq 64(%rsi),%ymm3,%ymm17 + vpmadd52huq 96(%rsi),%ymm3,%ymm18 + vpmadd52huq 128(%rsi),%ymm3,%ymm19 + + vpmadd52huq 0(%rcx),%ymm4,%ymm1 + vpmadd52huq 32(%rcx),%ymm4,%ymm16 + vpmadd52huq 64(%rcx),%ymm4,%ymm17 + vpmadd52huq 96(%rcx),%ymm4,%ymm18 + vpmadd52huq 128(%rcx),%ymm4,%ymm19 + leaq 32(%r11),%r11 + decl %ebx + jne .Lloop5 + + vmovdqa64 .Lmask52x4(%rip),%ymm4 + + vpbroadcastq %r9,%ymm3 + vpblendd $3,%ymm3,%ymm1,%ymm1 + + + + vpsrlq $52,%ymm1,%ymm24 + vpsrlq $52,%ymm16,%ymm25 + vpsrlq $52,%ymm17,%ymm26 + vpsrlq $52,%ymm18,%ymm27 + vpsrlq $52,%ymm19,%ymm28 + + + valignq $3,%ymm27,%ymm28,%ymm28 + valignq $3,%ymm26,%ymm27,%ymm27 + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm24,%ymm25,%ymm25 + valignq $3,%ymm0,%ymm24,%ymm24 + + + vpandq %ymm4,%ymm1,%ymm1 + vpandq %ymm4,%ymm16,%ymm16 + vpandq %ymm4,%ymm17,%ymm17 + vpandq %ymm4,%ymm18,%ymm18 + vpandq %ymm4,%ymm19,%ymm19 + + + vpaddq %ymm24,%ymm1,%ymm1 + vpaddq %ymm25,%ymm16,%ymm16 + vpaddq %ymm26,%ymm17,%ymm17 + vpaddq %ymm27,%ymm18,%ymm18 + vpaddq %ymm28,%ymm19,%ymm19 + + + + vpcmpuq $1,%ymm1,%ymm4,%k1 + vpcmpuq $1,%ymm16,%ymm4,%k2 + vpcmpuq 
$1,%ymm17,%ymm4,%k3 + vpcmpuq $1,%ymm18,%ymm4,%k4 + vpcmpuq $1,%ymm19,%ymm4,%k5 + kmovb %k1,%r14d + kmovb %k2,%r13d + kmovb %k3,%r12d + kmovb %k4,%r11d + kmovb %k5,%r10d + + + vpcmpuq $0,%ymm1,%ymm4,%k1 + vpcmpuq $0,%ymm16,%ymm4,%k2 + vpcmpuq $0,%ymm17,%ymm4,%k3 + vpcmpuq $0,%ymm18,%ymm4,%k4 + vpcmpuq $0,%ymm19,%ymm4,%k5 + kmovb %k1,%r9d + kmovb %k2,%r8d + kmovb %k3,%ebx + kmovb %k4,%ecx + kmovb %k5,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r12d,%k3 + shrb $4,%r12b + kmovb %r12d,%k4 + kmovb %r10d,%k5 + + + vpsubq %ymm4,%ymm1,%ymm1{%k1} + vpsubq %ymm4,%ymm16,%ymm16{%k2} + vpsubq %ymm4,%ymm17,%ymm17{%k3} + vpsubq %ymm4,%ymm18,%ymm18{%k4} + vpsubq %ymm4,%ymm19,%ymm19{%k5} + + vpandq %ymm4,%ymm1,%ymm1 + vpandq %ymm4,%ymm16,%ymm16 + vpandq %ymm4,%ymm17,%ymm17 + vpandq %ymm4,%ymm18,%ymm18 + vpandq %ymm4,%ymm19,%ymm19 + + vmovdqu64 %ymm1,(%rdi) + vmovdqu64 %ymm16,32(%rdi) + vmovdqu64 %ymm17,64(%rdi) + vmovdqu64 %ymm18,96(%rdi) + vmovdqu64 %ymm19,128(%rdi) + + vzeroupper + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lrsaz_amm52x20_x1_256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256 +.data +.align 32 +.Lmask52x4: +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.text + +.globl ossl_rsaz_amm52x20_x2_256 +.type ossl_rsaz_amm52x20_x2_256,@function +.align 32 +ossl_rsaz_amm52x20_x2_256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lrsaz_amm52x20_x2_256_body: + + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm1 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + vmovdqa64 %ymm0,%ymm2 + vmovdqa64 %ymm0,%ymm20 + vmovdqa64 %ymm0,%ymm21 + vmovdqa64 %ymm0,%ymm22 + vmovdqa64 %ymm0,%ymm23 + + xorl %r9d,%r9d + xorl %r15d,%r15d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + movl $20,%ebx + +.align 32 +.Lloop20: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm3 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq (%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm4 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm3,%ymm1 + vpmadd52luq 32(%rsi),%ymm3,%ymm16 + vpmadd52luq 64(%rsi),%ymm3,%ymm17 + vpmadd52luq 96(%rsi),%ymm3,%ymm18 + vpmadd52luq 128(%rsi),%ymm3,%ymm19 + + vpmadd52luq 0(%rcx),%ymm4,%ymm1 + vpmadd52luq 32(%rcx),%ymm4,%ymm16 + vpmadd52luq 64(%rcx),%ymm4,%ymm17 + vpmadd52luq 96(%rcx),%ymm4,%ymm18 + vpmadd52luq 128(%rcx),%ymm4,%ymm19 + + + valignq $1,%ymm1,%ymm16,%ymm1 + 
valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm1,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm3,%ymm1 + vpmadd52huq 32(%rsi),%ymm3,%ymm16 + vpmadd52huq 64(%rsi),%ymm3,%ymm17 + vpmadd52huq 96(%rsi),%ymm3,%ymm18 + vpmadd52huq 128(%rsi),%ymm3,%ymm19 + + vpmadd52huq 0(%rcx),%ymm4,%ymm1 + vpmadd52huq 32(%rcx),%ymm4,%ymm16 + vpmadd52huq 64(%rcx),%ymm4,%ymm17 + vpmadd52huq 96(%rcx),%ymm4,%ymm18 + vpmadd52huq 128(%rcx),%ymm4,%ymm19 + movq 160(%r11),%r13 + + vpbroadcastq %r13,%ymm3 + movq 160(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + movq %r12,%r10 + adcq $0,%r10 + + movq 8(%r8),%r13 + imulq %r15,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm4 + movq 160(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + adcq %r12,%r10 + + shrq $52,%r15 + salq $12,%r10 + orq %r10,%r15 + + vpmadd52luq 160(%rsi),%ymm3,%ymm2 + vpmadd52luq 192(%rsi),%ymm3,%ymm20 + vpmadd52luq 224(%rsi),%ymm3,%ymm21 + vpmadd52luq 256(%rsi),%ymm3,%ymm22 + vpmadd52luq 288(%rsi),%ymm3,%ymm23 + + vpmadd52luq 160(%rcx),%ymm4,%ymm2 + vpmadd52luq 192(%rcx),%ymm4,%ymm20 + vpmadd52luq 224(%rcx),%ymm4,%ymm21 + vpmadd52luq 256(%rcx),%ymm4,%ymm22 + vpmadd52luq 288(%rcx),%ymm4,%ymm23 + + + valignq $1,%ymm2,%ymm20,%ymm2 + valignq $1,%ymm20,%ymm21,%ymm20 + valignq $1,%ymm21,%ymm22,%ymm21 + valignq $1,%ymm22,%ymm23,%ymm22 + valignq $1,%ymm23,%ymm0,%ymm23 + + vmovq %xmm2,%r13 + addq %r13,%r15 + + vpmadd52huq 160(%rsi),%ymm3,%ymm2 + vpmadd52huq 192(%rsi),%ymm3,%ymm20 + vpmadd52huq 224(%rsi),%ymm3,%ymm21 + vpmadd52huq 256(%rsi),%ymm3,%ymm22 + vpmadd52huq 288(%rsi),%ymm3,%ymm23 + + vpmadd52huq 160(%rcx),%ymm4,%ymm2 + vpmadd52huq 192(%rcx),%ymm4,%ymm20 + vpmadd52huq 224(%rcx),%ymm4,%ymm21 + vpmadd52huq 256(%rcx),%ymm4,%ymm22 + vpmadd52huq 288(%rcx),%ymm4,%ymm23 + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop20 + + vmovdqa64 .Lmask52x4(%rip),%ymm4 + + vpbroadcastq %r9,%ymm3 + vpblendd $3,%ymm3,%ymm1,%ymm1 + + + + vpsrlq $52,%ymm1,%ymm24 + vpsrlq $52,%ymm16,%ymm25 + vpsrlq $52,%ymm17,%ymm26 + vpsrlq $52,%ymm18,%ymm27 + vpsrlq $52,%ymm19,%ymm28 + + + valignq $3,%ymm27,%ymm28,%ymm28 + valignq $3,%ymm26,%ymm27,%ymm27 + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm24,%ymm25,%ymm25 + valignq $3,%ymm0,%ymm24,%ymm24 + + + vpandq %ymm4,%ymm1,%ymm1 + vpandq %ymm4,%ymm16,%ymm16 + vpandq %ymm4,%ymm17,%ymm17 + vpandq %ymm4,%ymm18,%ymm18 + vpandq %ymm4,%ymm19,%ymm19 + + + vpaddq %ymm24,%ymm1,%ymm1 + vpaddq %ymm25,%ymm16,%ymm16 + vpaddq %ymm26,%ymm17,%ymm17 + vpaddq %ymm27,%ymm18,%ymm18 + vpaddq %ymm28,%ymm19,%ymm19 + + + + vpcmpuq $1,%ymm1,%ymm4,%k1 + vpcmpuq $1,%ymm16,%ymm4,%k2 + vpcmpuq $1,%ymm17,%ymm4,%k3 + vpcmpuq $1,%ymm18,%ymm4,%k4 + vpcmpuq $1,%ymm19,%ymm4,%k5 + kmovb %k1,%r14d + kmovb %k2,%r13d + kmovb %k3,%r12d + kmovb %k4,%r11d + kmovb %k5,%r10d + + + vpcmpuq $0,%ymm1,%ymm4,%k1 + vpcmpuq $0,%ymm16,%ymm4,%k2 + vpcmpuq $0,%ymm17,%ymm4,%k3 + vpcmpuq $0,%ymm18,%ymm4,%k4 + vpcmpuq $0,%ymm19,%ymm4,%k5 + kmovb %k1,%r9d + kmovb %k2,%r8d + kmovb %k3,%ebx + kmovb %k4,%ecx + kmovb %k5,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r12d,%k3 + shrb $4,%r12b + kmovb %r12d,%k4 + kmovb %r10d,%k5 + + + vpsubq %ymm4,%ymm1,%ymm1{%k1} + vpsubq 
%ymm4,%ymm16,%ymm16{%k2} + vpsubq %ymm4,%ymm17,%ymm17{%k3} + vpsubq %ymm4,%ymm18,%ymm18{%k4} + vpsubq %ymm4,%ymm19,%ymm19{%k5} + + vpandq %ymm4,%ymm1,%ymm1 + vpandq %ymm4,%ymm16,%ymm16 + vpandq %ymm4,%ymm17,%ymm17 + vpandq %ymm4,%ymm18,%ymm18 + vpandq %ymm4,%ymm19,%ymm19 + + vpbroadcastq %r15,%ymm3 + vpblendd $3,%ymm3,%ymm2,%ymm2 + + + + vpsrlq $52,%ymm2,%ymm24 + vpsrlq $52,%ymm20,%ymm25 + vpsrlq $52,%ymm21,%ymm26 + vpsrlq $52,%ymm22,%ymm27 + vpsrlq $52,%ymm23,%ymm28 + + + valignq $3,%ymm27,%ymm28,%ymm28 + valignq $3,%ymm26,%ymm27,%ymm27 + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm24,%ymm25,%ymm25 + valignq $3,%ymm0,%ymm24,%ymm24 + + + vpandq %ymm4,%ymm2,%ymm2 + vpandq %ymm4,%ymm20,%ymm20 + vpandq %ymm4,%ymm21,%ymm21 + vpandq %ymm4,%ymm22,%ymm22 + vpandq %ymm4,%ymm23,%ymm23 + + + vpaddq %ymm24,%ymm2,%ymm2 + vpaddq %ymm25,%ymm20,%ymm20 + vpaddq %ymm26,%ymm21,%ymm21 + vpaddq %ymm27,%ymm22,%ymm22 + vpaddq %ymm28,%ymm23,%ymm23 + + + + vpcmpuq $1,%ymm2,%ymm4,%k1 + vpcmpuq $1,%ymm20,%ymm4,%k2 + vpcmpuq $1,%ymm21,%ymm4,%k3 + vpcmpuq $1,%ymm22,%ymm4,%k4 + vpcmpuq $1,%ymm23,%ymm4,%k5 + kmovb %k1,%r14d + kmovb %k2,%r13d + kmovb %k3,%r12d + kmovb %k4,%r11d + kmovb %k5,%r10d + + + vpcmpuq $0,%ymm2,%ymm4,%k1 + vpcmpuq $0,%ymm20,%ymm4,%k2 + vpcmpuq $0,%ymm21,%ymm4,%k3 + vpcmpuq $0,%ymm22,%ymm4,%k4 + vpcmpuq $0,%ymm23,%ymm4,%k5 + kmovb %k1,%r9d + kmovb %k2,%r8d + kmovb %k3,%ebx + kmovb %k4,%ecx + kmovb %k5,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r12d,%k3 + shrb $4,%r12b + kmovb %r12d,%k4 + kmovb %r10d,%k5 + + + vpsubq %ymm4,%ymm2,%ymm2{%k1} + vpsubq %ymm4,%ymm20,%ymm20{%k2} + vpsubq %ymm4,%ymm21,%ymm21{%k3} + vpsubq %ymm4,%ymm22,%ymm22{%k4} + vpsubq %ymm4,%ymm23,%ymm23{%k5} + + vpandq %ymm4,%ymm2,%ymm2 + vpandq %ymm4,%ymm20,%ymm20 + vpandq %ymm4,%ymm21,%ymm21 + vpandq %ymm4,%ymm22,%ymm22 + vpandq %ymm4,%ymm23,%ymm23 + + vmovdqu64 %ymm1,(%rdi) + vmovdqu64 %ymm16,32(%rdi) + vmovdqu64 %ymm17,64(%rdi) + vmovdqu64 %ymm18,96(%rdi) + vmovdqu64 %ymm19,128(%rdi) + + vmovdqu64 %ymm2,160(%rdi) + vmovdqu64 %ymm20,192(%rdi) + vmovdqu64 %ymm21,224(%rdi) + vmovdqu64 %ymm22,256(%rdi) + vmovdqu64 %ymm23,288(%rdi) + + vzeroupper + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lrsaz_amm52x20_x2_256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256 +.text + +.align 32 +.globl ossl_extract_multiplier_2x20_win5 +.type ossl_extract_multiplier_2x20_win5,@function +ossl_extract_multiplier_2x20_win5: +.cfi_startproc +.byte 243,15,30,250 + leaq (%rcx,%rcx,4),%rax + salq $5,%rax + addq %rax,%rsi + + vmovdqa64 .Lones(%rip),%ymm23 + vpbroadcastq %rdx,%ymm22 + leaq 10240(%rsi),%rax + + vpxor %xmm4,%xmm4,%xmm4 + vmovdqa64 %ymm4,%ymm3 + vmovdqa64 %ymm4,%ymm2 + vmovdqa64 %ymm4,%ymm1 + vmovdqa64 %ymm4,%ymm0 + vmovdqa64 %ymm4,%ymm21 + +.align 32 +.Lloop: + vpcmpq $0,%ymm21,%ymm22,%k1 + addq $320,%rsi + vpaddq %ymm23,%ymm21,%ymm21 + vmovdqu64 -320(%rsi),%ymm16 + vmovdqu64 -288(%rsi),%ymm17 + vmovdqu64 -256(%rsi),%ymm18 + 
vmovdqu64 -224(%rsi),%ymm19 + vmovdqu64 -192(%rsi),%ymm20 + vpblendmq %ymm16,%ymm0,%ymm0{%k1} + vpblendmq %ymm17,%ymm1,%ymm1{%k1} + vpblendmq %ymm18,%ymm2,%ymm2{%k1} + vpblendmq %ymm19,%ymm3,%ymm3{%k1} + vpblendmq %ymm20,%ymm4,%ymm4{%k1} + cmpq %rsi,%rax + jne .Lloop + + vmovdqu64 %ymm0,(%rdi) + vmovdqu64 %ymm1,32(%rdi) + vmovdqu64 %ymm2,64(%rdi) + vmovdqu64 %ymm3,96(%rdi) + vmovdqu64 %ymm4,128(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5 +.data +.align 32 +.Lones: +.quad 1,1,1,1 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/rsaz-x86_64.S b/sys/crypto/openssl/amd64/rsaz-x86_64.S --- a/sys/crypto/openssl/amd64/rsaz-x86_64.S +++ b/sys/crypto/openssl/amd64/rsaz-x86_64.S @@ -2014,3 +2014,24 @@ .Linc: .long 0,0, 1,1 .long 2,2, 2,2 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/sha1-mb-x86_64.S b/sys/crypto/openssl/amd64/sha1-mb-x86_64.S --- a/sys/crypto/openssl/amd64/sha1-mb-x86_64.S +++ b/sys/crypto/openssl/amd64/sha1-mb-x86_64.S @@ -30,28 +30,36 @@ .Loop_grande: movl %edx,280(%rsp) xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -2586,14 +2594,18 @@ .Loop_grande_shaext: movl %edx,280(%rsp) xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rsp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -2969,28 +2981,36 @@ .Loop_grande_avx: movl %edx,280(%rsp) xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -5054,56 +5074,72 @@ movl %edx,552(%rsp) xorl %edx,%edx leaq 512(%rsp),%rbx + movq 0(%rsi),%r12 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r12 + movq 16(%rsi),%r13 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r13 + movq 32(%rsi),%r14 + movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r14 + movq 48(%rsi),%r15 + movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,12(%rbx) cmovleq %rbp,%r15 + movq 64(%rsi),%r8 + movl 72(%rsi),%ecx cmpl 
%edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,16(%rbx) cmovleq %rbp,%r8 + movq 80(%rsi),%r9 + movl 88(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,20(%rbx) cmovleq %rbp,%r9 + movq 96(%rsi),%r10 + movl 104(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,24(%rbx) cmovleq %rbp,%r10 + movq 112(%rsi),%r11 + movl 120(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -7266,3 +7302,24 @@ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/sha1-x86_64.S b/sys/crypto/openssl/amd64/sha1-x86_64.S --- a/sys/crypto/openssl/amd64/sha1-x86_64.S +++ b/sys/crypto/openssl/amd64/sha1-x86_64.S @@ -5449,3 +5449,24 @@ .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/sha256-mb-x86_64.S b/sys/crypto/openssl/amd64/sha256-mb-x86_64.S --- a/sys/crypto/openssl/amd64/sha256-mb-x86_64.S +++ b/sys/crypto/openssl/amd64/sha256-mb-x86_64.S @@ -31,28 +31,36 @@ .Loop_grande: movl %edx,280(%rsp) xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -2655,14 +2663,18 @@ .Loop_grande_shaext: movl %edx,280(%rsp) xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rsp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -3157,28 +3169,36 @@ .Loop_grande_avx: movl %edx,280(%rsp) xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -5420,56 +5440,72 @@ movl %edx,552(%rsp) xorl %edx,%edx leaq 512(%rsp),%rbx + movq 0(%rsi),%r12 + movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r12 + movq 16(%rsi),%r13 + movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r13 + movq 32(%rsi),%r14 + movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r14 + movq 48(%rsi),%r15 + movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,12(%rbx) cmovleq %rbp,%r15 + movq 64(%rsi),%r8 + movl 72(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,16(%rbx) cmovleq %rbp,%r8 + movq 80(%rsi),%r9 + movl 88(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,20(%rbx) cmovleq %rbp,%r9 + movq 96(%rsi),%r10 + movl 104(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,24(%rbx) cmovleq %rbp,%r10 + movq 112(%rsi),%r11 + movl 120(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx @@ -7947,3 +7983,24 @@ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/sha256-x86_64.S b/sys/crypto/openssl/amd64/sha256-x86_64.S --- a/sys/crypto/openssl/amd64/sha256-x86_64.S +++ b/sys/crypto/openssl/amd64/sha256-x86_64.S @@ -5455,3 +5455,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/sha512-x86_64.S b/sys/crypto/openssl/amd64/sha512-x86_64.S --- a/sys/crypto/openssl/amd64/sha512-x86_64.S +++ b/sys/crypto/openssl/amd64/sha512-x86_64.S @@ -5460,3 +5460,24 @@ .byte 0xf3,0xc3 .cfi_endproc .size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/vpaes-x86_64.S b/sys/crypto/openssl/amd64/vpaes-x86_64.S --- a/sys/crypto/openssl/amd64/vpaes-x86_64.S +++ b/sys/crypto/openssl/amd64/vpaes-x86_64.S @@ -624,6 +624,7 @@ .align 16 vpaes_set_encrypt_key: .cfi_startproc +.byte 243,15,30,250 movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -642,6 +643,7 @@ .align 16 vpaes_set_decrypt_key: .cfi_startproc +.byte 243,15,30,250 movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -665,6 +667,7 @@ .align 16 vpaes_encrypt: .cfi_startproc +.byte 243,15,30,250 movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_encrypt_core @@ -678,6 +681,7 @@ .align 16 vpaes_decrypt: .cfi_startproc +.byte 243,15,30,250 movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_decrypt_core @@ -690,6 +694,7 @@ .align 16 vpaes_cbc_encrypt: .cfi_startproc +.byte 243,15,30,250 xchgq %rcx,%rdx subq $16,%rcx jc .Lcbc_abort @@ -852,3 +857,24 @@ .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 64 .size _vpaes_consts,.-_vpaes_consts + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/wp-x86_64.S b/sys/crypto/openssl/amd64/wp-x86_64.S --- a/sys/crypto/openssl/amd64/wp-x86_64.S +++ b/sys/crypto/openssl/amd64/wp-x86_64.S @@ -878,3 +878,24 @@ .byte 228,39,65,139,167,125,149,216 .byte 251,238,124,102,221,23,71,158 .byte 202,45,191,7,173,90,131,51 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/x25519-x86_64.S b/sys/crypto/openssl/amd64/x25519-x86_64.S --- a/sys/crypto/openssl/amd64/x25519-x86_64.S +++ b/sys/crypto/openssl/amd64/x25519-x86_64.S @@ -801,3 +801,24 @@ .cfi_endproc .size x25519_fe64_tobytes,.-x25519_fe64_tobytes .byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/x86_64-gf2m.S b/sys/crypto/openssl/amd64/x86_64-gf2m.S --- a/sys/crypto/openssl/amd64/x86_64-gf2m.S +++ b/sys/crypto/openssl/amd64/x86_64-gf2m.S @@ -310,3 +310,24 @@ .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 .byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/x86_64-mont.S b/sys/crypto/openssl/amd64/x86_64-mont.S --- a/sys/crypto/openssl/amd64/x86_64-mont.S +++ b/sys/crypto/openssl/amd64/x86_64-mont.S @@ -1238,3 +1238,24 @@ .size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/x86_64-mont5.S b/sys/crypto/openssl/amd64/x86_64-mont5.S --- a/sys/crypto/openssl/amd64/x86_64-mont5.S +++ b/sys/crypto/openssl/amd64/x86_64-mont5.S @@ -3602,3 +3602,24 @@ .long 0,0, 1,1 .long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/amd64/x86_64cpuid.S b/sys/crypto/openssl/amd64/x86_64cpuid.S --- a/sys/crypto/openssl/amd64/x86_64cpuid.S +++ b/sys/crypto/openssl/amd64/x86_64cpuid.S @@ -13,6 +13,8 @@ .type OPENSSL_atomic_add,@function .align 16 OPENSSL_atomic_add: +.cfi_startproc +.byte 243,15,30,250 movl (%rdi),%eax .Lspin: leaq (%rsi,%rax,1),%r8 .byte 0xf0 @@ -21,16 +23,20 @@ movl %r8d,%eax .byte 0x48,0x98 .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_atomic_add,.-OPENSSL_atomic_add .globl OPENSSL_rdtsc .type OPENSSL_rdtsc,@function .align 16 OPENSSL_rdtsc: +.cfi_startproc +.byte 243,15,30,250 rdtsc shlq $32,%rdx orq %rdx,%rax .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_rdtsc,.-OPENSSL_rdtsc .globl OPENSSL_ia32_cpuid @@ -38,6 +44,7 @@ .align 16 OPENSSL_ia32_cpuid: .cfi_startproc +.byte 243,15,30,250 movq %rbx,%r8 .cfi_register %rbx,%r8 @@ -206,6 +213,8 @@ .type OPENSSL_cleanse,@function .align 16 OPENSSL_cleanse: +.cfi_startproc +.byte 243,15,30,250 xorq %rax,%rax cmpq $15,%rsi jae .Lot @@ -235,12 +244,15 @@ cmpq $0,%rsi jne .Little .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_cleanse,.-OPENSSL_cleanse .globl CRYPTO_memcmp .type CRYPTO_memcmp,@function .align 16 CRYPTO_memcmp: +.cfi_startproc +.byte 243,15,30,250 xorq %rax,%rax xorq %r10,%r10 cmpq $0,%rdx @@ -269,11 +281,14 @@ shrq $63,%rax .Lno_data: .byte 0xf3,0xc3 +.cfi_endproc .size CRYPTO_memcmp,.-CRYPTO_memcmp .globl OPENSSL_wipe_cpu .type OPENSSL_wipe_cpu,@function .align 16 OPENSSL_wipe_cpu: +.cfi_startproc +.byte 243,15,30,250 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -300,11 +315,14 @@ xorq %r11,%r11 leaq 8(%rsp),%rax .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu .globl OPENSSL_instrument_bus .type OPENSSL_instrument_bus,@function .align 16 OPENSSL_instrument_bus: +.cfi_startproc +.byte 243,15,30,250 movq %rdi,%r10 movq %rsi,%rcx movq %rsi,%r11 @@ -331,12 +349,15 @@ movq %r11,%rax .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus .globl OPENSSL_instrument_bus2 .type OPENSSL_instrument_bus2,@function .align 16 OPENSSL_instrument_bus2: +.cfi_startproc +.byte 243,15,30,250 movq %rdi,%r10 movq %rsi,%rcx movq %rdx,%r11 @@ -379,11 +400,14 @@ movq 8(%rsp),%rax subq %rcx,%rax .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 .globl OPENSSL_ia32_rdrand_bytes .type OPENSSL_ia32_rdrand_bytes,@function .align 16 OPENSSL_ia32_rdrand_bytes: +.cfi_startproc +.byte 243,15,30,250 xorq %rax,%rax cmpq $0,%rsi je .Ldone_rdrand_bytes @@ -420,11 +444,14 @@ .Ldone_rdrand_bytes: xorq %r10,%r10 .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_ia32_rdrand_bytes,.-OPENSSL_ia32_rdrand_bytes .globl OPENSSL_ia32_rdseed_bytes .type OPENSSL_ia32_rdseed_bytes,@function .align 16 OPENSSL_ia32_rdseed_bytes: +.cfi_startproc +.byte 243,15,30,250 xorq %rax,%rax cmpq $0,%rsi je .Ldone_rdseed_bytes @@ -461,4 +488,26 @@ .Ldone_rdseed_bytes: xorq %r10,%r10 .byte 0xf3,0xc3 +.cfi_endproc .size OPENSSL_ia32_rdseed_bytes,.-OPENSSL_ia32_rdseed_bytes + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/sys/crypto/openssl/arm/aes-armv4.S b/sys/crypto/openssl/arm/aes-armv4.S --- a/sys/crypto/openssl/arm/aes-armv4.S +++ b/sys/crypto/openssl/arm/aes-armv4.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from aes-armv4.pl. */ @ Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. @ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -39,13 +39,14 @@ @ Profiler-assisted and platform-specific optimization resulted in 16% @ improvement on Cortex A8 core and ~21.5 cycles per byte. +@ $output is the last argument if it looks like a file (it has an extension) +@ $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ #endif -.text #if defined(__thumb2__) && !defined(__APPLE__) .syntax unified .thumb @@ -54,6 +55,8 @@ #undef __thumb2__ #endif +.text + .type AES_Te,%object .align 5 AES_Te: diff --git a/sys/crypto/openssl/arm/aesv8-armx.S b/sys/crypto/openssl/arm/aesv8-armx.S --- a/sys/crypto/openssl/arm/aesv8-armx.S +++ b/sys/crypto/openssl/arm/aesv8-armx.S @@ -2,11 +2,18 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 -.text .arch armv7-a @ don't confuse not-so-latest binutils with argv8 :-) .fpu neon +#ifdef __thumb2__ +.syntax unified +.thumb +# define INST(a,b,c,d) .byte c,d|0xc,a,b +#else .code 32 -#undef __thumb2__ +# define INST(a,b,c,d) .byte a,b,c,d +#endif + +.text .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 @@ -49,7 +56,7 @@ vtbl.8 d21,{q3},d5 vext.8 q9,q0,q3,#12 vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 subs r1,r1,#1 veor q3,q3,q9 @@ -68,7 +75,7 @@ vtbl.8 d21,{q3},d5 vext.8 q9,q0,q3,#12 vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 veor q3,q3,q9 vext.8 q9,q0,q9,#12 @@ -83,7 +90,7 @@ vtbl.8 d21,{q3},d5 vext.8 q9,q0,q3,#12 vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 veor q3,q3,q9 vext.8 q9,q0,q9,#12 @@ -115,7 +122,7 @@ #else vst1.32 {d16},[r2]! #endif -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 subs r1,r1,#1 veor q3,q3,q9 @@ -151,7 +158,7 @@ vtbl.8 d21,{q8},d5 vext.8 q9,q0,q3,#12 vst1.32 {q8},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 subs r1,r1,#1 veor q3,q3,q9 @@ -167,7 +174,7 @@ vdup.32 q10,d7[1] vext.8 q9,q0,q8,#12 -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 veor q8,q8,q9 vext.8 q9,q0,q9,#12 @@ -210,15 +217,15 @@ .Loop_imc: vld1.32 {q0},[r2] vld1.32 {q1},[r0] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 vst1.32 {q0},[r0],r4 vst1.32 {q1},[r2]! cmp r0,r2 bhi .Loop_imc vld1.32 {q0},[r2] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 vst1.32 {q0},[r0] eor r0,r0,r0 @ return value @@ -236,19 +243,19 @@ vld1.32 {q1},[r2]! 
.Loop_enc: -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + INST(0x00,0x43,0xb0,0xf3) @ aese q2,q0 + INST(0x84,0x43,0xb0,0xf3) @ aesmc q2,q2 vld1.32 {q0},[r2]! subs r3,r3,#2 -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + INST(0x02,0x43,0xb0,0xf3) @ aese q2,q1 + INST(0x84,0x43,0xb0,0xf3) @ aesmc q2,q2 vld1.32 {q1},[r2]! bgt .Loop_enc -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + INST(0x00,0x43,0xb0,0xf3) @ aese q2,q0 + INST(0x84,0x43,0xb0,0xf3) @ aesmc q2,q2 vld1.32 {q0},[r2] -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + INST(0x02,0x43,0xb0,0xf3) @ aese q2,q1 veor q2,q2,q0 vst1.8 {q2},[r1] @@ -265,24 +272,336 @@ vld1.32 {q1},[r2]! .Loop_dec: -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + INST(0x40,0x43,0xb0,0xf3) @ aesd q2,q0 + INST(0xc4,0x43,0xb0,0xf3) @ aesimc q2,q2 vld1.32 {q0},[r2]! subs r3,r3,#2 -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + INST(0x42,0x43,0xb0,0xf3) @ aesd q2,q1 + INST(0xc4,0x43,0xb0,0xf3) @ aesimc q2,q2 vld1.32 {q1},[r2]! bgt .Loop_dec -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + INST(0x40,0x43,0xb0,0xf3) @ aesd q2,q0 + INST(0xc4,0x43,0xb0,0xf3) @ aesimc q2,q2 vld1.32 {q0},[r2] -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + INST(0x42,0x43,0xb0,0xf3) @ aesd q2,q1 veor q2,q2,q0 vst1.8 {q2},[r1] bx lr .size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_ecb_encrypt +.type aes_v8_ecb_encrypt,%function +.align 5 +aes_v8_ecb_encrypt: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load remaining args + subs r2,r2,#16 + mov r8,#16 + blo .Lecb_done + it eq + moveq r8,#0 + + cmp r4,#0 @ en- or decrypting? + ldr r5,[r3,#240] + and r2,r2,#-16 + vld1.8 {q0},[r0],r8 + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10,q11},[r7]! + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! + vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + beq .Lecb_dec + + vld1.8 {q1},[r0]! + subs r2,r2,#32 @ bias + add r6,r5,#2 + vorr q3,q1,q1 + vorr q10,q1,q1 + vorr q1,q0,q0 + blo .Lecb_enc_tail + + vorr q1,q3,q3 + vld1.8 {q10},[r0]! +.Loop3x_ecb_enc: + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q9},[r7]! 
+ bgt .Loop3x_ecb_enc + + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + subs r2,r2,#0x30 + it lo + movlo r6,r2 @ r6, r6, is zero at this point + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + add r0,r0,r6 @ r0 is adjusted in such way that + @ at exit from the loop q1-q10 + @ are loaded with last "words" + mov r7,r3 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x28,0x23,0xb0,0xf3) @ aese q1,q12 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x28,0x43,0xf0,0xf3) @ aese q10,q12 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.8 {q2},[r0]! + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2a,0x23,0xb0,0xf3) @ aese q1,q13 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2a,0x43,0xf0,0xf3) @ aese q10,q13 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.8 {q3},[r0]! + INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2c,0x23,0xb0,0xf3) @ aese q1,q14 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2c,0x43,0xf0,0xf3) @ aese q10,q14 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.8 {q11},[r0]! + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 + INST(0x2e,0x23,0xb0,0xf3) @ aese q1,q15 + INST(0x2e,0x43,0xf0,0xf3) @ aese q10,q15 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + add r6,r5,#2 + veor q4,q7,q0 + veor q5,q7,q1 + veor q10,q10,q7 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q4},[r1]! + vorr q0,q2,q2 + vst1.8 {q5},[r1]! + vorr q1,q3,q3 + vst1.8 {q10},[r1]! + vorr q10,q11,q11 + bhs .Loop3x_ecb_enc + + cmn r2,#0x30 + beq .Lecb_done + nop + +.Lecb_enc_tail: + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q9},[r7]! + bgt .Lecb_enc_tail + + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x28,0x23,0xb0,0xf3) @ aese q1,q12 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x28,0x43,0xf0,0xf3) @ aese q10,q12 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + cmn r2,#0x20 + INST(0x2a,0x23,0xb0,0xf3) @ aese q1,q13 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2a,0x43,0xf0,0xf3) @ aese q10,q13 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x2c,0x23,0xb0,0xf3) @ aese q1,q14 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2c,0x43,0xf0,0xf3) @ aese q10,q14 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x2e,0x23,0xb0,0xf3) @ aese q1,q15 + INST(0x2e,0x43,0xf0,0xf3) @ aese q10,q15 + beq .Lecb_enc_one + veor q5,q7,q1 + veor q9,q7,q10 + vst1.8 {q5},[r1]! + vst1.8 {q9},[r1]! 
+ b .Lecb_done + +.Lecb_enc_one: + veor q5,q7,q10 + vst1.8 {q5},[r1]! + b .Lecb_done +.align 5 +.Lecb_dec: + vld1.8 {q1},[r0]! + subs r2,r2,#32 @ bias + add r6,r5,#2 + vorr q3,q1,q1 + vorr q10,q1,q1 + vorr q1,q0,q0 + blo .Lecb_dec_tail + + vorr q1,q3,q3 + vld1.8 {q10},[r0]! +.Loop3x_ecb_dec: + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q9},[r7]! + bgt .Loop3x_ecb_dec + + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + subs r2,r2,#0x30 + it lo + movlo r6,r2 @ r6, r6, is zero at this point + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + add r0,r0,r6 @ r0 is adjusted in such way that + @ at exit from the loop q1-q10 + @ are loaded with last "words" + mov r7,r3 + INST(0x68,0x03,0xb0,0xf3) @ aesd q0,q12 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.8 {q2},[r0]! + INST(0x6a,0x03,0xb0,0xf3) @ aesd q0,q13 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.8 {q3},[r0]! + INST(0x6c,0x03,0xb0,0xf3) @ aesd q0,q14 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.8 {q11},[r0]! + INST(0x6e,0x03,0xb0,0xf3) @ aesd q0,q15 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + add r6,r5,#2 + veor q4,q7,q0 + veor q5,q7,q1 + veor q10,q10,q7 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q4},[r1]! + vorr q0,q2,q2 + vst1.8 {q5},[r1]! + vorr q1,q3,q3 + vst1.8 {q10},[r1]! + vorr q10,q11,q11 + bhs .Loop3x_ecb_dec + + cmn r2,#0x30 + beq .Lecb_done + nop + +.Lecb_dec_tail: + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
+ bgt .Lecb_dec_tail + + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + cmn r2,#0x20 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 + beq .Lecb_dec_one + veor q5,q7,q1 + veor q9,q7,q10 + vst1.8 {q5},[r1]! + vst1.8 {q9},[r1]! + b .Lecb_done + +.Lecb_dec_one: + veor q5,q7,q10 + vst1.8 {q5},[r1]! + +.Lecb_done: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,pc} +.size aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt .globl aes_v8_cbc_encrypt .type aes_v8_cbc_encrypt,%function .align 5 @@ -294,6 +613,7 @@ subs r2,r2,#16 mov r8,#16 blo .Lcbc_abort + it eq moveq r8,#0 cmp r5,#0 @ en- or decrypting? @@ -324,58 +644,59 @@ add r7,r3,#16 add r6,r3,#16*4 add r12,r3,#16*5 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 add r14,r3,#16*6 add r3,r3,#16*7 b .Lenter_cbc_enc .align 4 .Loop_cbc_enc: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vst1.8 {q6},[r1]! 
.Lenter_cbc_enc: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x04,0x03,0xb0,0xf3) @ aese q0,q2 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q8},[r6] cmp r5,#4 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x06,0x03,0xb0,0xf3) @ aese q0,q3 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q9},[r12] beq .Lcbc_enc192 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q8},[r14] -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q9},[r3] nop .Lcbc_enc192: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 subs r2,r2,#16 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + it eq moveq r8,#0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x24,0x03,0xb0,0xf3) @ aese q0,q10 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x26,0x03,0xb0,0xf3) @ aese q0,q11 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 veor q8,q8,q5 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q9},[r7] @ re-pre-load rndkey[1] -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 veor q6,q0,q7 bhs .Loop_cbc_enc @@ -385,35 +706,36 @@ .align 5 .Lcbc_enc128: vld1.32 {q2,q3},[r7] -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 b .Lenter_cbc_enc128 .Loop_cbc_enc128: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vst1.8 {q6},[r1]! 
.Lenter_cbc_enc128: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 subs r2,r2,#16 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x04,0x03,0xb0,0xf3) @ aese q0,q2 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + it eq moveq r8,#0 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x06,0x03,0xb0,0xf3) @ aese q0,q3 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x24,0x03,0xb0,0xf3) @ aese q0,q10 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x26,0x03,0xb0,0xf3) @ aese q0,q11 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 veor q8,q8,q5 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 veor q6,q0,q7 bhs .Loop_cbc_enc128 @@ -434,71 +756,71 @@ vorr q2,q0,q0 vorr q3,q1,q1 vorr q11,q10,q10 - .Loop3x_cbc_dec: -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q8},[r7]! subs r6,r6,#2 -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q9},[r7]! 
bgt .Loop3x_cbc_dec -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q4,q6,q7 subs r2,r2,#0x30 veor q5,q2,q7 + it lo movlo r6,r2 @ r6, r6, is zero at this point -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q9,q3,q7 add r0,r0,r6 @ r0 is adjusted in such way that @ at exit from the loop q1-q10 @ are loaded with last "words" vorr q6,q11,q11 mov r7,r3 -.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x68,0x03,0xb0,0xf3) @ aesd q0,q12 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.8 {q2},[r0]! -.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6a,0x03,0xb0,0xf3) @ aesd q0,q13 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.8 {q3},[r0]! -.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6c,0x03,0xb0,0xf3) @ aesd q0,q14 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.8 {q11},[r0]! -.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + INST(0x6e,0x03,0xb0,0xf3) @ aesd q0,q15 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 vld1.32 {q8},[r7]! 
@ re-pre-load rndkey[0] add r6,r5,#2 veor q4,q4,q0 @@ -518,44 +840,44 @@ nop .Lcbc_dec_tail: -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q8},[r7]! subs r6,r6,#2 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q9},[r7]! bgt .Lcbc_dec_tail -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 cmn r2,#0x20 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q5,q6,q7 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q9,q3,q7 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 beq .Lcbc_dec_one veor q5,q5,q1 veor q9,q9,q10 @@ -602,6 +924,7 @@ vld1.32 {q7},[r7] add r7,r3,#32 mov r6,r5 + it lo movlo r12,#0 #ifndef __ARMEB__ rev r8, r8 @@ -621,76 +944,76 @@ .align 4 .Loop3x_ctr32: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 vld1.32 {q8},[r7]! 
subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 vld1.32 {q9},[r7]! bgt .Loop3x_ctr32 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x83,0xb0,0xf3) @ aesmc q4,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0xa3,0xb0,0xf3) @ aesmc q5,q1 vld1.8 {q2},[r0]! add r9,r8,#1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 vld1.8 {q3},[r0]! rev r9,r9 -.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x22,0x83,0xb0,0xf3) @ aese q4,q9 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 + INST(0x22,0xa3,0xb0,0xf3) @ aese q5,q9 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 vld1.8 {q11},[r0]! mov r7,r3 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 -.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x23,0xf0,0xf3) @ aesmc q9,q10 + INST(0x28,0x83,0xb0,0xf3) @ aese q4,q12 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 + INST(0x28,0xa3,0xb0,0xf3) @ aese q5,q12 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 veor q2,q2,q7 add r10,r8,#2 -.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + INST(0x28,0x23,0xf0,0xf3) @ aese q9,q12 + INST(0xa2,0x23,0xf0,0xf3) @ aesmc q9,q9 veor q3,q3,q7 add r8,r8,#3 -.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x2a,0x83,0xb0,0xf3) @ aese q4,q13 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 + INST(0x2a,0xa3,0xb0,0xf3) @ aese q5,q13 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 veor q11,q11,q7 vmov.32 d13[1], r9 -.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + INST(0x2a,0x23,0xf0,0xf3) @ aese q9,q13 + INST(0xa2,0x23,0xf0,0xf3) @ aesmc q9,q9 vorr q0,q6,q6 rev r10,r10 -.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + INST(0x2c,0x83,0xb0,0xf3) @ aese q4,q14 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 vmov.32 d13[1], r10 rev r12,r8 -.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x2c,0xa3,0xb0,0xf3) @ aese q5,q14 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 vorr q1,q6,q6 vmov.32 d13[1], r12 -.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + INST(0x2c,0x23,0xf0,0xf3) @ aese q9,q14 + INST(0xa2,0x23,0xf0,0xf3) @ aesmc q9,q9 vorr q10,q6,q6 subs r2,r2,#3 -.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 -.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 -.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 + INST(0x2e,0x83,0xb0,0xf3) @ aese q4,q15 + INST(0x2e,0xa3,0xb0,0xf3) @ aese q5,q15 + 
INST(0x2e,0x23,0xf0,0xf3) @ aese q9,q15 veor q2,q2,q4 vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] @@ -707,48 +1030,49 @@ beq .Lctr32_done cmp r2,#1 mov r12,#16 + it eq moveq r12,#0 .Lctr32_tail: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.32 {q8},[r7]! subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.32 {q9},[r7]! bgt .Lctr32_tail -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.8 {q2},[r0],r12 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x28,0x23,0xb0,0xf3) @ aese q1,q12 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.8 {q3},[r0] -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2a,0x23,0xb0,0xf3) @ aese q1,q13 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 veor q2,q2,q7 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2c,0x23,0xb0,0xf3) @ aese q1,q14 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 veor q3,q3,q7 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 -.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 + INST(0x2e,0x23,0xb0,0xf3) @ aese q1,q15 cmp r2,#1 veor q2,q2,q0 diff --git a/sys/crypto/openssl/arm/armv4-gf2m.S b/sys/crypto/openssl/arm/armv4-gf2m.S --- a/sys/crypto/openssl/arm/armv4-gf2m.S +++ b/sys/crypto/openssl/arm/armv4-gf2m.S @@ -1,13 +1,14 @@ /* Do not modify. This file is auto-generated from armv4-gf2m.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif + +.text .type mul_1x1_ialu,%function .align 5 mul_1x1_ialu: @@ -100,11 +101,13 @@ #if __ARM_MAX_ARCH__>=7 stmdb sp!,{r10,lr} ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) adr r10,.LOPENSSL_armcap ldr r12,[r12,r10] -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV7_NEON itt ne ldrne r10,[sp],#8 @@ -218,7 +221,11 @@ #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-. +# endif #endif .byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 diff --git a/sys/crypto/openssl/arm/armv4-mont.S b/sys/crypto/openssl/arm/armv4-mont.S --- a/sys/crypto/openssl/arm/armv4-mont.S +++ b/sys/crypto/openssl/arm/armv4-mont.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from armv4-mont.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb @@ -9,10 +8,16 @@ .code 32 #endif +.text + #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lbn_mul_mont +# endif #endif .globl bn_mul_mont @@ -26,12 +31,14 @@ #if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu - adr r0,.Lbn_mul_mont - ldr r2,.LOPENSSL_armcap + ldr r0,.LOPENSSL_armcap +#if !defined(_WIN32) + adr r2,.Lbn_mul_mont ldr r0,[r0,r2] -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r0,[r0] -#endif +# endif tst r0,#ARMV7_NEON @ NEON available? ldmia sp, {r0,r2} beq .Lialu diff --git a/sys/crypto/openssl/arm/armv4cpuid.S b/sys/crypto/openssl/arm/armv4cpuid.S --- a/sys/crypto/openssl/arm/armv4cpuid.S +++ b/sys/crypto/openssl/arm/armv4cpuid.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from armv4cpuid.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) && !defined(__APPLE__) .syntax unified .thumb @@ -10,6 +9,8 @@ #undef __thumb2__ #endif +.text + .align 5 .globl OPENSSL_atomic_add .type OPENSSL_atomic_add,%function diff --git a/sys/crypto/openssl/arm/bsaes-armv7.S b/sys/crypto/openssl/arm/bsaes-armv7.S --- a/sys/crypto/openssl/arm/bsaes-armv7.S +++ b/sys/crypto/openssl/arm/bsaes-armv7.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from bsaes-armv7.pl. */ -@ Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. +@ Copyright 2012-2023 The OpenSSL Project Authors. All Rights Reserved. @ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -14,7 +14,7 @@ @ details see http://www.openssl.org/~appro/cryptogams/. @ @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ of Linaro. Permission to use under GPL terms is granted. +@ of Linaro. @ ==================================================================== @ Bit-sliced AES for ARM NEON @@ -50,6 +50,8 @@ @ April-August 2013 @ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. 
+@ $output is the last argument if it looks like a file (it has an extension) +@ $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" @@ -74,7 +76,6 @@ .arch armv7-a .fpu neon -.text .syntax unified @ ARMv7-capable assembler is expected to handle this #if defined(__thumb2__) && !defined(__APPLE__) .thumb @@ -83,6 +84,8 @@ # undef __thumb2__ #endif +.text + .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: @@ -1071,18 +1074,18 @@ -.globl bsaes_cbc_encrypt -.type bsaes_cbc_encrypt,%function +.globl ossl_bsaes_cbc_encrypt +.type ossl_bsaes_cbc_encrypt,%function .align 5 -bsaes_cbc_encrypt: +ossl_bsaes_cbc_encrypt: #ifndef __KERNEL__ cmp r2, #128 #ifndef __thumb__ blo AES_cbc_encrypt #else - bhs 1f + bhs .Lcbc_do_bsaes b AES_cbc_encrypt -1: +.Lcbc_do_bsaes: #endif #endif @@ -1336,12 +1339,12 @@ vst1.8 {q15}, [r8] @ return IV VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} -.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt +.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt -.globl bsaes_ctr32_encrypt_blocks -.type bsaes_ctr32_encrypt_blocks,%function +.globl ossl_bsaes_ctr32_encrypt_blocks +.type ossl_bsaes_ctr32_encrypt_blocks,%function .align 5 -bsaes_ctr32_encrypt_blocks: +ossl_bsaes_ctr32_encrypt_blocks: cmp r2, #8 @ use plain AES for blo .Lctr_enc_short @ small sizes @@ -1564,11 +1567,11 @@ vstmia sp!, {q0,q1} ldmia sp!, {r4,r5,r6,r7,r8, pc} -.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.type bsaes_xts_encrypt,%function +.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks +.globl ossl_bsaes_xts_encrypt +.type ossl_bsaes_xts_encrypt,%function .align 4 -bsaes_xts_encrypt: +ossl_bsaes_xts_encrypt: mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 VFP_ABI_PUSH @@ -2043,12 +2046,12 @@ VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt +.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt -.globl bsaes_xts_decrypt -.type bsaes_xts_decrypt,%function +.globl ossl_bsaes_xts_decrypt +.type ossl_bsaes_xts_decrypt,%function .align 4 -bsaes_xts_decrypt: +ossl_bsaes_xts_decrypt: mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 VFP_ABI_PUSH @@ -2554,5 +2557,5 @@ VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt #endif diff --git a/sys/crypto/openssl/arm/chacha-armv4.S b/sys/crypto/openssl/arm/chacha-armv4.S --- a/sys/crypto/openssl/arm/chacha-armv4.S +++ b/sys/crypto/openssl/arm/chacha-armv4.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from chacha-armv4.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) || defined(__clang__) .syntax unified #endif @@ -15,6 +14,8 @@ #define ldrhsb ldrbhs #endif +.text + .align 5 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral @@ -22,7 +23,11 @@ .long 1,0,0,0 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.LChaCha20_ctr32 +# endif #else .word -1 #endif @@ -49,8 +54,10 @@ cmp r2,#192 @ test len bls .Lshort ldr r4,[r14,#-32] +# if !defined(_WIN32) ldr r4,[r14,r4] -# ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r4,[r4] # endif tst r4,#ARMV7_NEON diff --git a/sys/crypto/openssl/arm/ecp_nistz256-armv4.S b/sys/crypto/openssl/arm/ecp_nistz256-armv4.S --- a/sys/crypto/openssl/arm/ecp_nistz256-armv4.S +++ b/sys/crypto/openssl/arm/ecp_nistz256-armv4.S @@ -1,13 +1,13 @@ /* Do not modify. This file is auto-generated from ecp_nistz256-armv4.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif +.section .rodata .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,%object .align 12 @@ -2381,6 +2381,8 @@ .byte 0xec,0xf0,0x42,0x88,0xd0,0x81,0x51,0xf9,0x1b,0xbc,0x43,0xa4,0x37,0xf1,0xd7,0x90,0x21,0x7e,0xa0,0x3e,0x63,0xfb,0x21,0xfa,0x12,0xfb,0xde,0xc7,0xbf,0xb3,0x58,0xe7,0x76,0x42,0x20,0x01,0x3d,0x66,0x80,0xf1,0xb8,0xaf,0xfa,0x7d,0x96,0x89,0x36,0x48,0x95,0xd9,0x6e,0x6d,0xe6,0x4f,0xff,0x2a,0x47,0x61,0xf2,0x04,0xb7,0x83,0x14,0xce .byte 0x0a,0x3c,0x73,0x17,0x50,0x88,0x03,0x25,0x4a,0xe3,0x13,0x55,0x8b,0x7e,0x50,0x38,0xfc,0x14,0x0b,0x04,0x8e,0xa8,0x5b,0xd6,0x72,0x20,0x60,0xe9,0xaa,0x22,0x82,0x11,0xc6,0xc4,0xd7,0xb9,0xc8,0x0c,0x7e,0x05,0xfb,0x90,0xe4,0x9c,0x28,0x89,0x29,0x99,0x63,0x4d,0xec,0x7b,0x50,0xbd,0xd8,0xa3,0x5b,0x50,0x77,0x19,0x81,0x92,0xce,0x82 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed + +.text .align 5 .LRR:@ 2^512 mod P precomputed for NIST P256 polynomial .long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb diff --git a/sys/crypto/openssl/arm/ghash-armv4.S b/sys/crypto/openssl/arm/ghash-armv4.S --- a/sys/crypto/openssl/arm/ghash-armv4.S +++ b/sys/crypto/openssl/arm/ghash-armv4.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from ghash-armv4.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) || defined(__clang__) .syntax unified #define ldrplb ldrbpl @@ -13,6 +12,8 @@ .code 32 #endif +.text + .type rem_4bit,%object .align 5 rem_4bit: diff --git a/sys/crypto/openssl/arm/ghashv8-armx.S b/sys/crypto/openssl/arm/ghashv8-armx.S --- a/sys/crypto/openssl/arm/ghashv8-armx.S +++ b/sys/crypto/openssl/arm/ghashv8-armx.S @@ -2,10 +2,17 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 -.text .fpu neon +#ifdef __thumb2__ +.syntax unified +.thumb +# define INST(a,b,c,d) .byte c,0xef,a,b +#else .code 32 -#undef __thumb2__ +# define INST(a,b,c,d) .byte a,b,c,0xf2 +#endif + +.text .globl gcm_init_v8 .type gcm_init_v8,%function .align 4 @@ -29,23 +36,23 @@ @ calculate H^2 vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing -.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 + INST(0xa8,0x0e,0xa8,0xf2) @ pmull q0,q12,q12 veor q8,q8,q12 -.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 -.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 + INST(0xa9,0x4e,0xa9,0xf2) @ pmull2 q2,q12,q12 + INST(0xa0,0x2e,0xa0,0xf2) @ pmull q1,q8,q8 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl veor q0,q1,q10 vext.8 q10,q0,q0,#8 @ 2nd phase -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q10,q10,q2 veor q14,q0,q10 @@ -68,23 +75,23 @@ #endif vext.8 q3,q9,q9,#8 -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + INST(0x86,0x0e,0xa8,0xf2) @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + INST(0x87,0x4e,0xa9,0xf2) @ pmull2 q2,q12,q3 @ H.hi·Xi.hi + INST(0xa2,0x2e,0xaa,0xf2) @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase of reduction vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl veor q0,q1,q10 vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q10,q10,q2 veor q0,q0,q10 @@ -120,6 +127,7 @@ vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 vmov.i8 q11,#0xe1 vld1.64 {q14},[r1] + it eq moveq r12,#0 @ is it time to zero r12? vext.8 q0,q0,q0,#8 @ rotate Xi vld1.64 {q8},[r2]! @ load [rotated] I[0] @@ -136,26 +144,28 @@ #endif vext.8 q7,q9,q9,#8 veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + INST(0x8e,0x8e,0xa8,0xf2) @ pmull q4,q12,q7 @ H·Ii+1 veor q9,q9,q7 @ Karatsuba pre-processing -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + INST(0x8f,0xce,0xa9,0xf2) @ pmull2 q6,q12,q7 b .Loop_mod2x_v8 .align 4 .Loop_mod2x_v8: vext.8 q10,q3,q3,#8 subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + INST(0x86,0x0e,0xac,0xf2) @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + it lo movlo r12,#0 @ is it time to zero r12? 
-.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 + INST(0xa2,0xae,0xaa,0xf2) @ pmull q5,q13,q9 veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi + INST(0x87,0x4e,0xad,0xf2) @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + INST(0xa5,0x2e,0xab,0xf2) @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] veor q2,q2,q6 + it eq moveq r12,#0 @ is it time to zero r12? veor q1,q1,q5 @@ -167,7 +177,7 @@ vrev64.8 q8,q8 #endif veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase of reduction #ifndef __ARMEB__ vrev64.8 q9,q9 @@ -177,15 +187,15 @@ vext.8 q7,q9,q9,#8 vext.8 q3,q8,q8,#8 veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + INST(0x8e,0x8e,0xa8,0xf2) @ pmull q4,q12,q7 @ H·Ii+1 veor q3,q3,q2 @ accumulate q3 early vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q3,q3,q10 veor q9,q9,q7 @ Karatsuba pre-processing veor q3,q3,q0 -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + INST(0x8f,0xce,0xa9,0xf2) @ pmull2 q6,q12,q7 bhs .Loop_mod2x_v8 @ there was at least 32 more bytes veor q2,q2,q10 @@ -198,23 +208,23 @@ veor q3,q3,q0 @ inp^=Xi veor q9,q8,q10 @ q9 is rotated inp^Xi -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + INST(0x86,0x0e,0xa8,0xf2) @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + INST(0x87,0x4e,0xa9,0xf2) @ pmull2 q2,q12,q3 @ H.hi·Xi.hi + INST(0xa2,0x2e,0xaa,0xf2) @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase of reduction vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl veor q0,q1,q10 vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q10,q10,q2 veor q0,q0,q10 diff --git a/sys/crypto/openssl/arm/keccak1600-armv4.S b/sys/crypto/openssl/arm/keccak1600-armv4.S --- a/sys/crypto/openssl/arm/keccak1600-armv4.S +++ b/sys/crypto/openssl/arm/keccak1600-armv4.S @@ -1,8 +1,6 @@ /* Do not modify. This file is auto-generated from keccak1600-armv4.pl. 
*/ #include "arm_arch.h" -.text - #if defined(__thumb2__) .syntax unified .thumb @@ -10,6 +8,8 @@ .code 32 #endif +.text + .type iotas32, %object .align 5 iotas32: @@ -1826,7 +1826,14 @@ #endif blo .Lround2x +#if __ARM_ARCH__>=5 ldr pc,[sp,#440] +#else + ldr lr,[sp,#440] + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size KeccakF1600_int,.-KeccakF1600_int .type KeccakF1600, %function @@ -1865,7 +1872,14 @@ stmia r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} add sp,sp,#440+20 +#if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size KeccakF1600,.-KeccakF1600 .globl SHA3_absorb .type SHA3_absorb,%function @@ -2011,7 +2025,14 @@ .Labsorb_abort: add sp,sp,#456+32 mov r0,r12 @ return value +#if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size SHA3_absorb,.-SHA3_absorb .globl SHA3_squeeze .type SHA3_squeeze,%function @@ -2156,7 +2177,14 @@ .align 4 .Lsqueeze_done: add sp,sp,#24 +#if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size SHA3_squeeze,.-SHA3_squeeze #if __ARM_MAX_ARCH__>=7 .fpu neon @@ -2362,7 +2390,7 @@ subs r3, r3, #1 bne .Loop_neon -.word 0xe12fff1e + bx lr .size KeccakF1600_neon,.-KeccakF1600_neon .globl SHA3_absorb_neon diff --git a/sys/crypto/openssl/arm/poly1305-armv4.S b/sys/crypto/openssl/arm/poly1305-armv4.S --- a/sys/crypto/openssl/arm/poly1305-armv4.S +++ b/sys/crypto/openssl/arm/poly1305-armv4.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from poly1305-armv4.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb @@ -9,6 +8,8 @@ .code 32 #endif +.text + .globl poly1305_emit .globl poly1305_blocks .globl poly1305_init @@ -53,8 +54,10 @@ and r4,r4,r10 #if __ARM_MAX_ARCH__>=7 +# if !defined(_WIN32) ldr r12,[r11,r12] @ OPENSSL_armcap_P -# ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] # endif #endif @@ -69,32 +72,22 @@ #if __ARM_MAX_ARCH__>=7 tst r12,#ARMV7_NEON @ check for NEON -# ifdef __APPLE__ - adr r9,poly1305_blocks_neon - adr r11,poly1305_blocks -# ifdef __thumb2__ - it ne -# endif +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + adr r12,.Lpoly1305_emit + adr r10,.Lpoly1305_emit_neon + itt ne movne r11,r9 - adr r12,poly1305_emit - adr r10,poly1305_emit_neon -# ifdef __thumb2__ - it ne -# endif movne r12,r10 + orr r11,r11,#1 @ thumb-ify address + orr r12,r12,#1 # else -# ifdef __thumb2__ - itete eq -# endif addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init) addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) # endif -# ifdef __thumb2__ - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 -# endif #endif ldrb r9,[r1,#11] orr r6,r6,r7,lsl#8 @@ -1162,7 +1155,11 @@ .Lzeros: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lpoly1305_init +# endif #endif .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 diff --git a/sys/crypto/openssl/arm/sha1-armv4-large.S b/sys/crypto/openssl/arm/sha1-armv4-large.S --- a/sys/crypto/openssl/arm/sha1-armv4-large.S +++ b/sys/crypto/openssl/arm/sha1-armv4-large.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from sha1-armv4-large.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb @@ -9,6 +8,8 @@ .code 32 #endif +.text + .globl sha1_block_data_order .type sha1_block_data_order,%function @@ -16,12 +17,14 @@ sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 .Lsha1_block: - adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) + adr r3,.Lsha1_block ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV8_SHA1 bne .LARMv8 tst r12,#ARMV7_NEON @@ -486,7 +489,11 @@ .LK_60_79:.word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lsha1_block +# endif #endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 diff --git a/sys/crypto/openssl/arm/sha256-armv4.S b/sys/crypto/openssl/arm/sha256-armv4.S --- a/sys/crypto/openssl/arm/sha256-armv4.S +++ b/sys/crypto/openssl/arm/sha256-armv4.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha256-armv4.pl. */ @ Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. @ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. 
You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -44,6 +44,8 @@ @ @ Add ARMv8 code path performing at 2.0 cpb on Apple A7. +@ $output is the last argument if it looks like a file (it has an extension) +@ $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" #else @@ -51,7 +53,6 @@ # define __ARM_MAX_ARCH__ 7 #endif -.text #if defined(__thumb2__) .syntax unified .thumb @@ -59,6 +60,8 @@ .code 32 #endif +.text + .type K256,%object .align 5 K256: @@ -82,7 +85,11 @@ .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lsha256_block_data_order +# endif #endif .align 5 @@ -97,10 +104,12 @@ #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV8_SHA256 bne .LARMv8 tst r12,#ARMV7_NEON diff --git a/sys/crypto/openssl/arm/sha512-armv4.S b/sys/crypto/openssl/arm/sha512-armv4.S --- a/sys/crypto/openssl/arm/sha512-armv4.S +++ b/sys/crypto/openssl/arm/sha512-armv4.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha512-armv4.pl. */ @ Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. @ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -74,7 +74,6 @@ # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 #endif -.text #if defined(__thumb2__) .syntax unified .thumb @@ -83,6 +82,8 @@ .code 32 #endif +.text + .type K512,%object .align 5 K512: @@ -129,7 +130,11 @@ .size K512,.-K512 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lsha512_block_data_order +# endif .skip 32-4 #else .skip 32 @@ -146,10 +151,12 @@ #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV7_NEON bne .LNEON #endif diff --git a/sys/crypto/openssl/i386/aes-586.S b/sys/crypto/openssl/i386/aes-586.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/i386/aes-586.S @@ -0,0 +1,6644 @@ +/* Do not modify. This file is auto-generated from aes-586.pl. 
*/ +#ifdef PIC +.text +.type _x86_AES_encrypt_compact,@function +.align 16 +_x86_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L000loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %ecx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ecx,%edi + xorl %esi,%ecx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ecx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%ecx + andl %edx,%ebp + leal (%edx,%edx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %edx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %edx,%edi + xorl %esi,%edx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%edx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%edx + andl %eax,%ebp + leal (%eax,%eax,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %eax,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %eax,%edi + xorl %esi,%eax + rorl $24,%edi + xorl %ebp,%esi + roll $24,%eax + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%eax + andl %ebx,%ebp + leal (%ebx,%ebx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %ebx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ebx,%edi + xorl %esi,%ebx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ebx + xorl %edi,%esi + xorl %esi,%ebx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb 
.L000loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact +.type _sse_AES_encrypt_compact,@function +.align 16 +_sse_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L001loop: + pshufw $8,%mm0,%mm1 + pshufw $13,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $13,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $8,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $8,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + movd %mm2,%eax + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + movd %mm6,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $8,%esi + shrl $16,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shrl $16,%eax + movd %ecx,%mm1 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + andl $255,%eax + orl %esi,%ecx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + andl $255,%ebx + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%ecx + shll $16,%eax 
+ movzbl -128(%ebp,%edi,1),%esi + orl %eax,%edx + shll $8,%esi + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + orl %ebx,%edx + movl 20(%esp),%edi + movd %ecx,%mm4 + movd %edx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L002out + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + movq %mm0,%mm1 + movq %mm4,%mm5 + pcmpgtb %mm0,%mm3 + pcmpgtb %mm4,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + pshufw $177,%mm0,%mm2 + pshufw $177,%mm4,%mm6 + paddb %mm0,%mm0 + paddb %mm4,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pshufw $177,%mm2,%mm3 + pshufw $177,%mm6,%mm7 + pxor %mm0,%mm1 + pxor %mm4,%mm5 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm3,%mm2 + movq %mm7,%mm6 + pslld $8,%mm3 + pslld $8,%mm7 + psrld $24,%mm2 + psrld $24,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + psrld $8,%mm1 + psrld $8,%mm5 + movl -128(%ebp),%eax + pslld $24,%mm3 + pslld $24,%mm7 + movl -64(%ebp),%ebx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl (%ebp),%ecx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl 64(%ebp),%edx + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L001loop +.align 16 +.L002out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact +.type _x86_AES_encrypt,@function +.align 16 +_x86_AES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L003loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl (%ebp,%esi,8),%esi + movzbl %ch,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movzbl %bh,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + movl (%ebp,%edx,8),%edx + movzbl %ah,%eax + xorl 3(%ebp,%eax,8),%edx + movl 4(%esp),%eax + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + xorl 1(%ebp,%ecx,8),%edx + movl %esi,%ecx + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L003loop + movl %eax,%esi + andl $255,%esi + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %bh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %ch,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl 
$4278190080,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %dh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movzbl %bh,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movl 2(%ebp,%edx,8),%edx + andl $255,%edx + movzbl %ah,%eax + movl (%ebp,%eax,8),%eax + andl $65280,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movl (%ebp,%ebx,8),%ebx + andl $16711680,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movl 2(%ebp,%ecx,8),%ecx + andl $4278190080,%ecx + xorl %ecx,%edx + movl %esi,%ecx + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Te: +.long 2774754246,2774754246 +.long 2222750968,2222750968 +.long 2574743534,2574743534 +.long 2373680118,2373680118 +.long 234025727,234025727 +.long 3177933782,3177933782 +.long 2976870366,2976870366 +.long 1422247313,1422247313 +.long 1345335392,1345335392 +.long 50397442,50397442 +.long 2842126286,2842126286 +.long 2099981142,2099981142 +.long 436141799,436141799 +.long 1658312629,1658312629 +.long 3870010189,3870010189 +.long 2591454956,2591454956 +.long 1170918031,1170918031 +.long 2642575903,2642575903 +.long 1086966153,1086966153 +.long 2273148410,2273148410 +.long 368769775,368769775 +.long 3948501426,3948501426 +.long 3376891790,3376891790 +.long 200339707,200339707 +.long 3970805057,3970805057 +.long 1742001331,1742001331 +.long 4255294047,4255294047 +.long 3937382213,3937382213 +.long 3214711843,3214711843 +.long 4154762323,4154762323 +.long 2524082916,2524082916 +.long 1539358875,1539358875 +.long 3266819957,3266819957 +.long 486407649,486407649 +.long 2928907069,2928907069 +.long 1780885068,1780885068 +.long 1513502316,1513502316 +.long 1094664062,1094664062 +.long 49805301,49805301 +.long 1338821763,1338821763 +.long 1546925160,1546925160 +.long 4104496465,4104496465 +.long 887481809,887481809 +.long 150073849,150073849 +.long 2473685474,2473685474 +.long 1943591083,1943591083 +.long 1395732834,1395732834 +.long 1058346282,1058346282 +.long 201589768,201589768 +.long 1388824469,1388824469 +.long 1696801606,1696801606 +.long 1589887901,1589887901 +.long 672667696,672667696 +.long 2711000631,2711000631 +.long 251987210,251987210 +.long 3046808111,3046808111 +.long 151455502,151455502 +.long 907153956,907153956 +.long 2608889883,2608889883 +.long 1038279391,1038279391 +.long 652995533,652995533 +.long 1764173646,1764173646 +.long 3451040383,3451040383 +.long 2675275242,2675275242 +.long 453576978,453576978 +.long 2659418909,2659418909 +.long 1949051992,1949051992 +.long 773462580,773462580 +.long 756751158,756751158 +.long 2993581788,2993581788 +.long 3998898868,3998898868 +.long 4221608027,4221608027 +.long 4132590244,4132590244 +.long 1295727478,1295727478 +.long 1641469623,1641469623 +.long 3467883389,3467883389 +.long 2066295122,2066295122 +.long 1055122397,1055122397 +.long 1898917726,1898917726 +.long 2542044179,2542044179 +.long 4115878822,4115878822 +.long 1758581177,1758581177 +.long 0,0 +.long 753790401,753790401 +.long 1612718144,1612718144 +.long 536673507,536673507 +.long 3367088505,3367088505 +.long 3982187446,3982187446 +.long 3194645204,3194645204 +.long 1187761037,1187761037 +.long 3653156455,3653156455 +.long 1262041458,1262041458 +.long 
3729410708,3729410708 +.long 3561770136,3561770136 +.long 3898103984,3898103984 +.long 1255133061,1255133061 +.long 1808847035,1808847035 +.long 720367557,720367557 +.long 3853167183,3853167183 +.long 385612781,385612781 +.long 3309519750,3309519750 +.long 3612167578,3612167578 +.long 1429418854,1429418854 +.long 2491778321,2491778321 +.long 3477423498,3477423498 +.long 284817897,284817897 +.long 100794884,100794884 +.long 2172616702,2172616702 +.long 4031795360,4031795360 +.long 1144798328,1144798328 +.long 3131023141,3131023141 +.long 3819481163,3819481163 +.long 4082192802,4082192802 +.long 4272137053,4272137053 +.long 3225436288,3225436288 +.long 2324664069,2324664069 +.long 2912064063,2912064063 +.long 3164445985,3164445985 +.long 1211644016,1211644016 +.long 83228145,83228145 +.long 3753688163,3753688163 +.long 3249976951,3249976951 +.long 1977277103,1977277103 +.long 1663115586,1663115586 +.long 806359072,806359072 +.long 452984805,452984805 +.long 250868733,250868733 +.long 1842533055,1842533055 +.long 1288555905,1288555905 +.long 336333848,336333848 +.long 890442534,890442534 +.long 804056259,804056259 +.long 3781124030,3781124030 +.long 2727843637,2727843637 +.long 3427026056,3427026056 +.long 957814574,957814574 +.long 1472513171,1472513171 +.long 4071073621,4071073621 +.long 2189328124,2189328124 +.long 1195195770,1195195770 +.long 2892260552,2892260552 +.long 3881655738,3881655738 +.long 723065138,723065138 +.long 2507371494,2507371494 +.long 2690670784,2690670784 +.long 2558624025,2558624025 +.long 3511635870,3511635870 +.long 2145180835,2145180835 +.long 1713513028,1713513028 +.long 2116692564,2116692564 +.long 2878378043,2878378043 +.long 2206763019,2206763019 +.long 3393603212,3393603212 +.long 703524551,703524551 +.long 3552098411,3552098411 +.long 1007948840,1007948840 +.long 2044649127,2044649127 +.long 3797835452,3797835452 +.long 487262998,487262998 +.long 1994120109,1994120109 +.long 1004593371,1004593371 +.long 1446130276,1446130276 +.long 1312438900,1312438900 +.long 503974420,503974420 +.long 3679013266,3679013266 +.long 168166924,168166924 +.long 1814307912,1814307912 +.long 3831258296,3831258296 +.long 1573044895,1573044895 +.long 1859376061,1859376061 +.long 4021070915,4021070915 +.long 2791465668,2791465668 +.long 2828112185,2828112185 +.long 2761266481,2761266481 +.long 937747667,937747667 +.long 2339994098,2339994098 +.long 854058965,854058965 +.long 1137232011,1137232011 +.long 1496790894,1496790894 +.long 3077402074,3077402074 +.long 2358086913,2358086913 +.long 1691735473,1691735473 +.long 3528347292,3528347292 +.long 3769215305,3769215305 +.long 3027004632,3027004632 +.long 4199962284,4199962284 +.long 133494003,133494003 +.long 636152527,636152527 +.long 2942657994,2942657994 +.long 2390391540,2390391540 +.long 3920539207,3920539207 +.long 403179536,403179536 +.long 3585784431,3585784431 +.long 2289596656,2289596656 +.long 1864705354,1864705354 +.long 1915629148,1915629148 +.long 605822008,605822008 +.long 4054230615,4054230615 +.long 3350508659,3350508659 +.long 1371981463,1371981463 +.long 602466507,602466507 +.long 2094914977,2094914977 +.long 2624877800,2624877800 +.long 555687742,555687742 +.long 3712699286,3712699286 +.long 3703422305,3703422305 +.long 2257292045,2257292045 +.long 2240449039,2240449039 +.long 2423288032,2423288032 +.long 1111375484,1111375484 +.long 3300242801,3300242801 +.long 2858837708,2858837708 +.long 3628615824,3628615824 +.long 84083462,84083462 +.long 32962295,32962295 +.long 302911004,302911004 +.long 
2741068226,2741068226 +.long 1597322602,1597322602 +.long 4183250862,4183250862 +.long 3501832553,3501832553 +.long 2441512471,2441512471 +.long 1489093017,1489093017 +.long 656219450,656219450 +.long 3114180135,3114180135 +.long 954327513,954327513 +.long 335083755,335083755 +.long 3013122091,3013122091 +.long 856756514,856756514 +.long 3144247762,3144247762 +.long 1893325225,1893325225 +.long 2307821063,2307821063 +.long 2811532339,2811532339 +.long 3063651117,3063651117 +.long 572399164,572399164 +.long 2458355477,2458355477 +.long 552200649,552200649 +.long 1238290055,1238290055 +.long 4283782570,4283782570 +.long 2015897680,2015897680 +.long 2061492133,2061492133 +.long 2408352771,2408352771 +.long 4171342169,4171342169 +.long 2156497161,2156497161 +.long 386731290,386731290 +.long 3669999461,3669999461 +.long 837215959,837215959 +.long 3326231172,3326231172 +.long 3093850320,3093850320 +.long 3275833730,3275833730 +.long 2962856233,2962856233 +.long 1999449434,1999449434 +.long 286199582,286199582 +.long 3417354363,3417354363 +.long 4233385128,4233385128 +.long 3602627437,3602627437 +.long 974525996,974525996 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 
173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.long 1,2,4,8 +.long 16,32,64,128 +.long 27,54,0,0 +.long 0,0,0,0 +.size _x86_AES_encrypt,.-_x86_AES_encrypt +.globl AES_encrypt +.type AES_encrypt,@function +.align 16 +AES_encrypt: +.L_AES_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L004pic_point +.L004pic_point: + popl %ebp + leal OPENSSL_ia32cap_P-.L004pic_point(%ebp),%eax + leal .LAES_Te-.L004pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L005x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L005x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_encrypt,.-.L_AES_encrypt_begin +.type 
_x86_AES_decrypt_compact,@function +.align 16 +_x86_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L006loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%eax + subl %edi,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%edi + andl %eax,%edi + movl %edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %ecx,%eax + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ecx,%ebx + roll $8,%ecx + xorl %esi,%ebp + xorl %eax,%ecx + xorl %ebp,%eax + xorl %ebx,%ecx + xorl %ebp,%ebx + roll $24,%eax + xorl %ebp,%ecx + roll $16,%ebx + xorl %eax,%ecx + roll $8,%ebp + xorl %ebx,%ecx + movl 4(%esp),%eax + xorl %ebp,%ecx + movl %ecx,12(%esp) + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %edx,%ebx + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %edx,%ecx + roll $8,%edx + xorl %esi,%ebp + xorl %ebx,%edx + xorl %ebp,%ebx + xorl %ecx,%edx + xorl %ebp,%ecx + roll $24,%ebx + xorl %ebp,%edx + roll $16,%ecx + xorl %ebx,%edx + roll $8,%ebp + xorl %ecx,%edx + movl 8(%esp),%ebx 
+ xorl %ebp,%edx + movl %edx,16(%esp) + movl $2155905152,%edi + andl %eax,%edi + movl %edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %eax,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %eax,%edx + roll $8,%eax + xorl %esi,%ebp + xorl %ecx,%eax + xorl %ebp,%ecx + xorl %edx,%eax + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%eax + roll $16,%edx + xorl %ecx,%eax + roll $8,%ebp + xorl %edx,%eax + xorl %ebp,%eax + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ebx,%edx + roll $8,%ebx + xorl %esi,%ebp + xorl %ecx,%ebx + xorl %ebp,%ecx + xorl %edx,%ebx + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%ebp + xorl %edx,%ebx + movl 12(%esp),%ecx + xorl %ebp,%ebx + movl 16(%esp),%edx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L006loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact +.type _sse_AES_decrypt_compact,@function +.align 16 
+_sse_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L007loop: + pshufw $12,%mm0,%mm1 + pshufw $9,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $6,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $3,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movd %mm2,%eax + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $16,%esi + movd %mm6,%ebx + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %al,%edi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $16,%esi + shrl $16,%eax + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shrl $16,%ebx + shll $8,%esi + movd %edx,%mm1 + movzbl -128(%ebp,%edi,1),%edx + movzbl %bh,%edi + shll $24,%edx + andl $255,%ebx + orl %esi,%edx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movzbl %ah,%eax + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + orl %ebx,%edx + shll $16,%esi + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%edx + shll $24,%eax + orl %eax,%ecx + movl 20(%esp),%edi + movd %edx,%mm4 + movd %ecx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L008out + movq %mm0,%mm3 + movq %mm4,%mm7 + pshufw $228,%mm0,%mm2 + pshufw $228,%mm4,%mm6 + movq %mm0,%mm1 + movq %mm4,%mm5 + pshufw $177,%mm0,%mm0 + pshufw $177,%mm4,%mm4 + pslld $8,%mm2 + pslld $8,%mm6 + psrld $8,%mm3 + psrld $8,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pslld $16,%mm2 + pslld $16,%mm6 + psrld $16,%mm3 + psrld $16,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movq 8(%esp),%mm3 + pxor %mm2,%mm2 + pxor %mm6,%mm6 + pcmpgtb %mm1,%mm2 + pcmpgtb %mm5,%mm6 + pand %mm3,%mm2 + pand %mm3,%mm6 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm2,%mm1 + pxor %mm6,%mm5 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq %mm1,%mm2 + movq %mm5,%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pslld $24,%mm3 + pslld $24,%mm7 + psrld $8,%mm2 + psrld $8,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pshufw $177,%mm1,%mm3 + pshufw $177,%mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + pshufw $177,%mm1,%mm2 + pshufw $177,%mm5,%mm6 + 
pxor %mm2,%mm0 + pxor %mm6,%mm4 + pslld $8,%mm1 + pslld $8,%mm5 + psrld $8,%mm3 + psrld $8,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl -128(%ebp),%eax + pslld $16,%mm1 + pslld $16,%mm5 + movl -64(%ebp),%ebx + psrld $16,%mm3 + psrld $16,%mm7 + movl (%ebp),%ecx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl 64(%ebp),%edx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L007loop +.align 16 +.L008out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact +.type _x86_AES_decrypt,@function +.align 16 +_x86_AES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L009loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ebx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %ah,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + andl $255,%edx + movl (%ebp,%edx,8),%edx + movzbl %ch,%ecx + xorl 3(%ebp,%ecx,8),%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + shrl $24,%eax + xorl 1(%ebp,%eax,8),%edx + movl 4(%esp),%eax + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L009loop + leal 2176(%ebp),%ebp + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi + leal -128(%ebp),%ebp + movl %eax,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl (%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl (%ebp,%ecx,1),%ecx + shll $8,%ecx + 
xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl (%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl (%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + leal -2048(%ebp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Td: +.long 1353184337,1353184337 +.long 1399144830,1399144830 +.long 3282310938,3282310938 +.long 2522752826,2522752826 +.long 3412831035,3412831035 +.long 4047871263,4047871263 +.long 2874735276,2874735276 +.long 2466505547,2466505547 +.long 1442459680,1442459680 +.long 4134368941,4134368941 +.long 2440481928,2440481928 +.long 625738485,625738485 +.long 4242007375,4242007375 +.long 3620416197,3620416197 +.long 2151953702,2151953702 +.long 2409849525,2409849525 +.long 1230680542,1230680542 +.long 1729870373,1729870373 +.long 2551114309,2551114309 +.long 3787521629,3787521629 +.long 41234371,41234371 +.long 317738113,317738113 +.long 2744600205,2744600205 +.long 3338261355,3338261355 +.long 3881799427,3881799427 +.long 2510066197,2510066197 +.long 3950669247,3950669247 +.long 3663286933,3663286933 +.long 763608788,763608788 +.long 3542185048,3542185048 +.long 694804553,694804553 +.long 1154009486,1154009486 +.long 1787413109,1787413109 +.long 2021232372,2021232372 +.long 1799248025,1799248025 +.long 3715217703,3715217703 +.long 3058688446,3058688446 +.long 397248752,397248752 +.long 1722556617,1722556617 +.long 3023752829,3023752829 +.long 407560035,407560035 +.long 2184256229,2184256229 +.long 1613975959,1613975959 +.long 1165972322,1165972322 +.long 3765920945,3765920945 +.long 2226023355,2226023355 +.long 480281086,480281086 +.long 2485848313,2485848313 +.long 1483229296,1483229296 +.long 436028815,436028815 +.long 2272059028,2272059028 +.long 3086515026,3086515026 +.long 601060267,601060267 +.long 3791801202,3791801202 +.long 1468997603,1468997603 +.long 715871590,715871590 +.long 120122290,120122290 +.long 63092015,63092015 +.long 2591802758,2591802758 +.long 2768779219,2768779219 +.long 4068943920,4068943920 +.long 2997206819,2997206819 +.long 3127509762,3127509762 +.long 1552029421,1552029421 +.long 723308426,723308426 +.long 2461301159,2461301159 +.long 4042393587,4042393587 +.long 2715969870,2715969870 +.long 3455375973,3455375973 +.long 3586000134,3586000134 +.long 526529745,526529745 +.long 2331944644,2331944644 +.long 2639474228,2639474228 +.long 2689987490,2689987490 +.long 853641733,853641733 +.long 1978398372,1978398372 +.long 971801355,971801355 +.long 2867814464,2867814464 +.long 111112542,111112542 +.long 1360031421,1360031421 +.long 4186579262,4186579262 +.long 1023860118,1023860118 +.long 2919579357,2919579357 +.long 1186850381,1186850381 +.long 3045938321,3045938321 +.long 90031217,90031217 +.long 1876166148,1876166148 +.long 4279586912,4279586912 +.long 620468249,620468249 +.long 2548678102,2548678102 +.long 3426959497,3426959497 +.long 2006899047,2006899047 +.long 3175278768,3175278768 +.long 2290845959,2290845959 +.long 945494503,945494503 +.long 3689859193,3689859193 +.long 1191869601,1191869601 +.long 3910091388,3910091388 +.long 3374220536,3374220536 +.long 0,0 +.long 2206629897,2206629897 +.long 1223502642,1223502642 +.long 2893025566,2893025566 +.long 1316117100,1316117100 +.long 4227796733,4227796733 +.long 1446544655,1446544655 +.long 517320253,517320253 +.long 658058550,658058550 +.long 1691946762,1691946762 +.long 564550760,564550760 +.long 3511966619,3511966619 +.long 
976107044,976107044 +.long 2976320012,2976320012 +.long 266819475,266819475 +.long 3533106868,3533106868 +.long 2660342555,2660342555 +.long 1338359936,1338359936 +.long 2720062561,2720062561 +.long 1766553434,1766553434 +.long 370807324,370807324 +.long 179999714,179999714 +.long 3844776128,3844776128 +.long 1138762300,1138762300 +.long 488053522,488053522 +.long 185403662,185403662 +.long 2915535858,2915535858 +.long 3114841645,3114841645 +.long 3366526484,3366526484 +.long 2233069911,2233069911 +.long 1275557295,1275557295 +.long 3151862254,3151862254 +.long 4250959779,4250959779 +.long 2670068215,2670068215 +.long 3170202204,3170202204 +.long 3309004356,3309004356 +.long 880737115,880737115 +.long 1982415755,1982415755 +.long 3703972811,3703972811 +.long 1761406390,1761406390 +.long 1676797112,1676797112 +.long 3403428311,3403428311 +.long 277177154,277177154 +.long 1076008723,1076008723 +.long 538035844,538035844 +.long 2099530373,2099530373 +.long 4164795346,4164795346 +.long 288553390,288553390 +.long 1839278535,1839278535 +.long 1261411869,1261411869 +.long 4080055004,4080055004 +.long 3964831245,3964831245 +.long 3504587127,3504587127 +.long 1813426987,1813426987 +.long 2579067049,2579067049 +.long 4199060497,4199060497 +.long 577038663,577038663 +.long 3297574056,3297574056 +.long 440397984,440397984 +.long 3626794326,3626794326 +.long 4019204898,4019204898 +.long 3343796615,3343796615 +.long 3251714265,3251714265 +.long 4272081548,4272081548 +.long 906744984,906744984 +.long 3481400742,3481400742 +.long 685669029,685669029 +.long 646887386,646887386 +.long 2764025151,2764025151 +.long 3835509292,3835509292 +.long 227702864,227702864 +.long 2613862250,2613862250 +.long 1648787028,1648787028 +.long 3256061430,3256061430 +.long 3904428176,3904428176 +.long 1593260334,1593260334 +.long 4121936770,4121936770 +.long 3196083615,3196083615 +.long 2090061929,2090061929 +.long 2838353263,2838353263 +.long 3004310991,3004310991 +.long 999926984,999926984 +.long 2809993232,2809993232 +.long 1852021992,1852021992 +.long 2075868123,2075868123 +.long 158869197,158869197 +.long 4095236462,4095236462 +.long 28809964,28809964 +.long 2828685187,2828685187 +.long 1701746150,1701746150 +.long 2129067946,2129067946 +.long 147831841,147831841 +.long 3873969647,3873969647 +.long 3650873274,3650873274 +.long 3459673930,3459673930 +.long 3557400554,3557400554 +.long 3598495785,3598495785 +.long 2947720241,2947720241 +.long 824393514,824393514 +.long 815048134,815048134 +.long 3227951669,3227951669 +.long 935087732,935087732 +.long 2798289660,2798289660 +.long 2966458592,2966458592 +.long 366520115,366520115 +.long 1251476721,1251476721 +.long 4158319681,4158319681 +.long 240176511,240176511 +.long 804688151,804688151 +.long 2379631990,2379631990 +.long 1303441219,1303441219 +.long 1414376140,1414376140 +.long 3741619940,3741619940 +.long 3820343710,3820343710 +.long 461924940,461924940 +.long 3089050817,3089050817 +.long 2136040774,2136040774 +.long 82468509,82468509 +.long 1563790337,1563790337 +.long 1937016826,1937016826 +.long 776014843,776014843 +.long 1511876531,1511876531 +.long 1389550482,1389550482 +.long 861278441,861278441 +.long 323475053,323475053 +.long 2355222426,2355222426 +.long 2047648055,2047648055 +.long 2383738969,2383738969 +.long 2302415851,2302415851 +.long 3995576782,3995576782 +.long 902390199,902390199 +.long 3991215329,3991215329 +.long 1018251130,1018251130 +.long 1507840668,1507840668 +.long 1064563285,1064563285 +.long 2043548696,2043548696 +.long 3208103795,3208103795 
+.long 3939366739,3939366739 +.long 1537932639,1537932639 +.long 342834655,342834655 +.long 2262516856,2262516856 +.long 2180231114,2180231114 +.long 1053059257,1053059257 +.long 741614648,741614648 +.long 1598071746,1598071746 +.long 1925389590,1925389590 +.long 203809468,203809468 +.long 2336832552,2336832552 +.long 1100287487,1100287487 +.long 1895934009,1895934009 +.long 3736275976,3736275976 +.long 2632234200,2632234200 +.long 2428589668,2428589668 +.long 1636092795,1636092795 +.long 1890988757,1890988757 +.long 1952214088,1952214088 +.long 1113045200,1113045200 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 
226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.size _x86_AES_decrypt,.-_x86_AES_decrypt +.globl AES_decrypt +.type AES_decrypt,@function +.align 16 +AES_decrypt: +.L_AES_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L010pic_point +.L010pic_point: + popl %ebp + leal OPENSSL_ia32cap_P-.L010pic_point(%ebp),%eax + leal .LAES_Td-.L010pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L011x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L011x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_decrypt,.-.L_AES_decrypt_begin +.globl AES_cbc_encrypt +.type AES_cbc_encrypt,@function +.align 16 +AES_cbc_encrypt: +.L_AES_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%ecx + cmpl $0,%ecx + je .L012drop_out + call .L013pic_point +.L013pic_point: + popl %ebp + leal OPENSSL_ia32cap_P-.L013pic_point(%ebp),%eax + cmpl $0,40(%esp) + leal .LAES_Te-.L013pic_point(%ebp),%ebp + jne .L014picked_te + leal .LAES_Td-.LAES_Te(%ebp),%ebp +.L014picked_te: + pushfl + cld + cmpl $512,%ecx + jb .L015slow_way + testl $15,%ecx + jnz .L015slow_way + btl $28,(%eax) + jc .L015slow_way + leal -324(%esp),%esi + 
andl $-64,%esi + movl %ebp,%eax + leal 2304(%ebp),%ebx + movl %esi,%edx + andl $4095,%eax + andl $4095,%ebx + andl $4095,%edx + cmpl %ebx,%edx + jb .L016tbl_break_out + subl %ebx,%edx + subl %edx,%esi + jmp .L017tbl_ok +.align 4 +.L016tbl_break_out: + subl %eax,%edx + andl $4095,%edx + addl $384,%edx + subl %edx,%esi +.align 4 +.L017tbl_ok: + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 12(%edx),%edi + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl $0,316(%esp) + movl %edi,%ebx + movl $61,%ecx + subl %ebp,%ebx + movl %edi,%esi + andl $4095,%ebx + leal 76(%esp),%edi + cmpl $2304,%ebx + jb .L018do_copy + cmpl $3852,%ebx + jb .L019skip_copy +.align 4 +.L018do_copy: + movl %edi,44(%esp) +.long 2784229001 +.L019skip_copy: + movl $16,%edi +.align 4 +.L020prefetch_tbl: + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%esi + leal 128(%ebp),%ebp + subl $1,%edi + jnz .L020prefetch_tbl + subl $2048,%ebp + movl 32(%esp),%esi + movl 48(%esp),%edi + cmpl $0,%edx + je .L021fast_decrypt + movl (%edi),%eax + movl 4(%edi),%ebx +.align 16 +.L022fast_enc_loop: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + leal 16(%esi),%esi + movl 40(%esp),%ecx + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L022fast_enc_loop + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L023skip_ezero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L023skip_ezero: + movl 28(%esp),%esp + popfl +.L012drop_out: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L021fast_decrypt: + cmpl 36(%esp),%esi + je .L024fast_dec_in_place + movl %edi,52(%esp) +.align 4 +.align 16 +.L025fast_dec_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 52(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 36(%esp),%edi + movl 32(%esp),%esi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl %esi,52(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edi + movl %edi,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L025fast_dec_loop + movl 52(%esp),%edi + movl 48(%esp),%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + jmp .L026fast_dec_out +.align 16 +.L024fast_dec_in_place: +.L027fast_dec_in_place_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 48(%esp),%edi + movl 36(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + 
leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L027fast_dec_in_place_loop +.align 4 +.L026fast_dec_out: + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L028skip_dzero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L028skip_dzero: + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L015slow_way: + movl (%eax),%eax + movl 36(%esp),%edi + leal -80(%esp),%esi + andl $-64,%esi + leal -143(%edi),%ebx + subl %esi,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esi + leal 768(%esi),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl %eax,52(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl %esi,%edi + movl %eax,%esi + cmpl $0,%edx + je .L029slow_decrypt + cmpl $16,%ecx + movl %ebx,%edx + jb .L030slow_enc_tail + btl $25,52(%esp) + jnc .L031slow_enc_x86 + movq (%edi),%mm0 + movq 8(%edi),%mm4 +.align 16 +.L032slow_enc_loop_sse: + pxor (%esi),%mm0 + pxor 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl 40(%esp),%ecx + movq %mm0,(%edi) + movq %mm4,8(%edi) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L032slow_enc_loop_sse + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L031slow_enc_x86: + movl (%edi),%eax + movl 4(%edi),%ebx +.align 4 +.L033slow_enc_loop_x86: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L033slow_enc_loop_x86 + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L030slow_enc_tail: + emms + movl %edx,%edi + movl $16,%ebx + subl %ecx,%ebx + cmpl %esi,%edi + je .L034enc_in_place +.align 4 +.long 2767451785 + jmp .L035enc_skip_in_place +.L034enc_in_place: + leal (%edi,%ecx,1),%edi +.L035enc_skip_in_place: + movl %ebx,%ecx + xorl %eax,%eax +.align 4 +.long 2868115081 + movl 48(%esp),%edi + movl %edx,%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl $16,40(%esp) + jmp .L033slow_enc_loop_x86 +.align 16 +.L029slow_decrypt: + btl $25,52(%esp) + jnc .L036slow_dec_loop_x86 +.align 4 +.L037slow_dec_loop_sse: + movq (%esi),%mm0 + movq 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_decrypt_compact + movl 
32(%esp),%esi + leal 60(%esp),%eax + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl 48(%esp),%edi + movq (%esi),%mm1 + movq 8(%esi),%mm5 + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movq %mm1,(%edi) + movq %mm5,8(%edi) + subl $16,%ecx + jc .L038slow_dec_partial_sse + movq %mm0,(%ebx) + movq %mm4,8(%ebx) + leal 16(%ebx),%ebx + movl %ebx,36(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + movl %ecx,40(%esp) + jnz .L037slow_dec_loop_sse + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L038slow_dec_partial_sse: + movq %mm0,(%eax) + movq %mm4,8(%eax) + emms + addl $16,%ecx + movl %ebx,%edi + movl %eax,%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L036slow_dec_loop_x86: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt_compact + movl 48(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + subl $16,%esi + jc .L039slow_dec_partial_x86 + movl %esi,40(%esp) + movl 36(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + leal 16(%esi),%esi + movl %esi,32(%esp) + jnz .L036slow_dec_loop_x86 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L039slow_dec_partial_x86: + leal 60(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 32(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl 36(%esp),%edi + leal 60(%esp),%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin +.type _x86_AES_set_encrypt_key,@function +.align 16 +_x86_AES_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%esi + movl 32(%esp),%edi + testl $-1,%esi + jz .L040badpointer + testl $-1,%edi + jz .L040badpointer + call .L041pic_point +.L041pic_point: + popl %ebp + leal .LAES_Te-.L041pic_point(%ebp),%ebp + leal 2176(%ebp),%ebp + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx + movl 28(%esp),%ecx + cmpl $128,%ecx + je .L04210rounds + cmpl $192,%ecx + je .L04312rounds + cmpl $256,%ecx + je .L04414rounds + movl $-2,%eax + jmp .L045exit +.L04210rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + xorl %ecx,%ecx + jmp .L04610shortcut +.align 4 +.L04710loop: + movl (%edi),%eax + movl 12(%edi),%edx +.L04610shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl 
-128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,16(%edi) + xorl 4(%edi),%eax + movl %eax,20(%edi) + xorl 8(%edi),%eax + movl %eax,24(%edi) + xorl 12(%edi),%eax + movl %eax,28(%edi) + incl %ecx + addl $16,%edi + cmpl $10,%ecx + jl .L04710loop + movl $10,80(%edi) + xorl %eax,%eax + jmp .L045exit +.L04312rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%ecx + movl 20(%esi),%edx + movl %ecx,16(%edi) + movl %edx,20(%edi) + xorl %ecx,%ecx + jmp .L04812shortcut +.align 4 +.L04912loop: + movl (%edi),%eax + movl 20(%edi),%edx +.L04812shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,24(%edi) + xorl 4(%edi),%eax + movl %eax,28(%edi) + xorl 8(%edi),%eax + movl %eax,32(%edi) + xorl 12(%edi),%eax + movl %eax,36(%edi) + cmpl $7,%ecx + je .L05012break + incl %ecx + xorl 16(%edi),%eax + movl %eax,40(%edi) + xorl 20(%edi),%eax + movl %eax,44(%edi) + addl $24,%edi + jmp .L04912loop +.L05012break: + movl $12,72(%edi) + xorl %eax,%eax + jmp .L045exit +.L04414rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,16(%edi) + movl %ebx,20(%edi) + movl %ecx,24(%edi) + movl %edx,28(%edi) + xorl %ecx,%ecx + jmp .L05114shortcut +.align 4 +.L05214loop: + movl 28(%edi),%edx +.L05114shortcut: + movl (%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,32(%edi) + xorl 4(%edi),%eax + movl %eax,36(%edi) + xorl 8(%edi),%eax + movl %eax,40(%edi) + xorl 12(%edi),%eax + movl %eax,44(%edi) + cmpl $6,%ecx + je .L05314break + incl %ecx + movl %eax,%edx + movl 16(%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + shll $8,%ebx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $16,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $24,%ebx + xorl %ebx,%eax + movl %eax,48(%edi) + xorl 20(%edi),%eax + movl %eax,52(%edi) + xorl 24(%edi),%eax + movl %eax,56(%edi) + xorl 28(%edi),%eax + movl %eax,60(%edi) + addl $32,%edi + jmp .L05214loop +.L05314break: + movl $14,48(%edi) + xorl %eax,%eax + jmp .L045exit +.L040badpointer: + movl $-1,%eax +.L045exit: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,@function +.align 16 +AES_set_encrypt_key: +.L_AES_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + ret 
+.size AES_set_encrypt_key,.-.L_AES_set_encrypt_key_begin +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,@function +.align 16 +AES_set_decrypt_key: +.L_AES_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + cmpl $0,%eax + je .L054proceed + ret +.L054proceed: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%esi + movl 240(%esi),%ecx + leal (,%ecx,4),%ecx + leal (%esi,%ecx,4),%edi +.align 4 +.L055invert: + movl (%esi),%eax + movl 4(%esi),%ebx + movl (%edi),%ecx + movl 4(%edi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,(%esi) + movl %edx,4(%esi) + movl 8(%esi),%eax + movl 12(%esi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,8(%edi) + movl %ebx,12(%edi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + addl $16,%esi + subl $16,%edi + cmpl %edi,%esi + jne .L055invert + movl 28(%esp),%edi + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,28(%esp) + movl 16(%edi),%eax +.align 4 +.L056permute: + addl $16,%edi + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %eax,%ebx + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + xorl %eax,%ecx + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + roll $8,%eax + xorl %esi,%edx + movl 4(%edi),%ebp + xorl %ebx,%eax + xorl %edx,%ebx + xorl %ecx,%eax + roll $24,%ebx + xorl %edx,%ecx + xorl %edx,%eax + roll $16,%ecx + xorl %ebx,%eax + roll $8,%edx + xorl %ecx,%eax + movl %ebp,%ebx + xorl %edx,%eax + movl %eax,(%edi) + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + xorl %ebx,%edx + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + roll $8,%ebx + xorl %esi,%eax + movl 8(%edi),%ebp + xorl %ecx,%ebx + xorl %eax,%ecx + xorl %edx,%ebx + roll $24,%ecx + xorl %eax,%edx + xorl %eax,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%eax + xorl %edx,%ebx + movl %ebp,%ecx + xorl %eax,%ebx + movl %ebx,4(%edi) + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %ecx,%edx + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + xorl %ecx,%eax + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + roll $8,%ecx + xorl %esi,%ebx + movl 12(%edi),%ebp + xorl %edx,%ecx + xorl %ebx,%edx + xorl %eax,%ecx + roll $24,%edx + xorl %ebx,%eax + xorl %ebx,%ecx + roll $16,%eax + xorl %edx,%ecx + roll $8,%ebx + xorl %eax,%ecx + movl %ebp,%edx + xorl %ebx,%ecx + movl %ecx,8(%edi) + movl 
$2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %edx,%eax + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + xorl %edx,%ebx + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + roll $8,%edx + xorl %esi,%ecx + movl 16(%edi),%ebp + xorl %eax,%edx + xorl %ecx,%eax + xorl %ebx,%edx + roll $24,%eax + xorl %ecx,%ebx + xorl %ecx,%edx + roll $16,%ebx + xorl %eax,%edx + roll $8,%ecx + xorl %ebx,%edx + movl %ebp,%eax + xorl %ecx,%edx + movl %edx,12(%edi) + cmpl 28(%esp),%edi + jb .L056permute + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_set_decrypt_key,.-.L_AES_set_decrypt_key_begin +.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89 +.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 +.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: +#else +.text +.type _x86_AES_encrypt_compact,@function +.align 16 +_x86_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L000loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl 
$4278124286,%edi + subl %ebp,%esi + movl %ecx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ecx,%edi + xorl %esi,%ecx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ecx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%ecx + andl %edx,%ebp + leal (%edx,%edx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %edx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %edx,%edi + xorl %esi,%edx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%edx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%edx + andl %eax,%ebp + leal (%eax,%eax,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %eax,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %eax,%edi + xorl %esi,%eax + rorl $24,%edi + xorl %ebp,%esi + roll $24,%eax + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%eax + andl %ebx,%ebp + leal (%ebx,%ebx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %ebx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ebx,%edi + xorl %esi,%ebx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ebx + xorl %edi,%esi + xorl %esi,%ebx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L000loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact +.type _sse_AES_encrypt_compact,@function +.align 16 +_sse_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 
64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L001loop: + pshufw $8,%mm0,%mm1 + pshufw $13,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $13,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $8,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $8,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + movd %mm2,%eax + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + movd %mm6,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $8,%esi + shrl $16,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shrl $16,%eax + movd %ecx,%mm1 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + andl $255,%eax + orl %esi,%ecx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + andl $255,%ebx + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%ecx + shll $16,%eax + movzbl -128(%ebp,%edi,1),%esi + orl %eax,%edx + shll $8,%esi + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + orl %ebx,%edx + movl 20(%esp),%edi + movd %ecx,%mm4 + movd %edx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L002out + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + movq %mm0,%mm1 + movq %mm4,%mm5 + pcmpgtb %mm0,%mm3 + pcmpgtb %mm4,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + pshufw $177,%mm0,%mm2 + pshufw $177,%mm4,%mm6 + paddb %mm0,%mm0 + paddb %mm4,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pshufw $177,%mm2,%mm3 + pshufw $177,%mm6,%mm7 + pxor %mm0,%mm1 + pxor %mm4,%mm5 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm3,%mm2 + movq %mm7,%mm6 + pslld $8,%mm3 + pslld $8,%mm7 + psrld $24,%mm2 + psrld $24,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + psrld $8,%mm1 + psrld $8,%mm5 + movl -128(%ebp),%eax + pslld $24,%mm3 + pslld $24,%mm7 + movl -64(%ebp),%ebx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl (%ebp),%ecx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl 64(%ebp),%edx + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L001loop +.align 16 +.L002out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact +.type _x86_AES_encrypt,@function +.align 16 +_x86_AES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L003loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl (%ebp,%esi,8),%esi + movzbl %ch,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + 
movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movzbl %bh,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + movl (%ebp,%edx,8),%edx + movzbl %ah,%eax + xorl 3(%ebp,%eax,8),%edx + movl 4(%esp),%eax + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + xorl 1(%ebp,%ecx,8),%edx + movl %esi,%ecx + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L003loop + movl %eax,%esi + andl $255,%esi + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %bh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %ch,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %dh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movzbl %bh,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movl 2(%ebp,%edx,8),%edx + andl $255,%edx + movzbl %ah,%eax + movl (%ebp,%eax,8),%eax + andl $65280,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movl (%ebp,%ebx,8),%ebx + andl $16711680,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movl 2(%ebp,%ecx,8),%ecx + andl $4278190080,%ecx + xorl %ecx,%edx + movl %esi,%ecx + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Te: +.long 2774754246,2774754246 +.long 2222750968,2222750968 +.long 2574743534,2574743534 +.long 2373680118,2373680118 +.long 234025727,234025727 +.long 3177933782,3177933782 +.long 2976870366,2976870366 +.long 1422247313,1422247313 +.long 1345335392,1345335392 +.long 50397442,50397442 +.long 2842126286,2842126286 +.long 2099981142,2099981142 +.long 436141799,436141799 +.long 1658312629,1658312629 +.long 3870010189,3870010189 +.long 2591454956,2591454956 +.long 1170918031,1170918031 +.long 2642575903,2642575903 +.long 1086966153,1086966153 +.long 2273148410,2273148410 +.long 368769775,368769775 +.long 3948501426,3948501426 +.long 3376891790,3376891790 +.long 200339707,200339707 +.long 3970805057,3970805057 +.long 1742001331,1742001331 +.long 4255294047,4255294047 +.long 3937382213,3937382213 +.long 3214711843,3214711843 +.long 4154762323,4154762323 +.long 2524082916,2524082916 +.long 1539358875,1539358875 +.long 3266819957,3266819957 +.long 486407649,486407649 +.long 2928907069,2928907069 +.long 1780885068,1780885068 +.long 1513502316,1513502316 +.long 1094664062,1094664062 +.long 49805301,49805301 +.long 1338821763,1338821763 +.long 1546925160,1546925160 +.long 4104496465,4104496465 +.long 
887481809,887481809 +.long 150073849,150073849 +.long 2473685474,2473685474 +.long 1943591083,1943591083 +.long 1395732834,1395732834 +.long 1058346282,1058346282 +.long 201589768,201589768 +.long 1388824469,1388824469 +.long 1696801606,1696801606 +.long 1589887901,1589887901 +.long 672667696,672667696 +.long 2711000631,2711000631 +.long 251987210,251987210 +.long 3046808111,3046808111 +.long 151455502,151455502 +.long 907153956,907153956 +.long 2608889883,2608889883 +.long 1038279391,1038279391 +.long 652995533,652995533 +.long 1764173646,1764173646 +.long 3451040383,3451040383 +.long 2675275242,2675275242 +.long 453576978,453576978 +.long 2659418909,2659418909 +.long 1949051992,1949051992 +.long 773462580,773462580 +.long 756751158,756751158 +.long 2993581788,2993581788 +.long 3998898868,3998898868 +.long 4221608027,4221608027 +.long 4132590244,4132590244 +.long 1295727478,1295727478 +.long 1641469623,1641469623 +.long 3467883389,3467883389 +.long 2066295122,2066295122 +.long 1055122397,1055122397 +.long 1898917726,1898917726 +.long 2542044179,2542044179 +.long 4115878822,4115878822 +.long 1758581177,1758581177 +.long 0,0 +.long 753790401,753790401 +.long 1612718144,1612718144 +.long 536673507,536673507 +.long 3367088505,3367088505 +.long 3982187446,3982187446 +.long 3194645204,3194645204 +.long 1187761037,1187761037 +.long 3653156455,3653156455 +.long 1262041458,1262041458 +.long 3729410708,3729410708 +.long 3561770136,3561770136 +.long 3898103984,3898103984 +.long 1255133061,1255133061 +.long 1808847035,1808847035 +.long 720367557,720367557 +.long 3853167183,3853167183 +.long 385612781,385612781 +.long 3309519750,3309519750 +.long 3612167578,3612167578 +.long 1429418854,1429418854 +.long 2491778321,2491778321 +.long 3477423498,3477423498 +.long 284817897,284817897 +.long 100794884,100794884 +.long 2172616702,2172616702 +.long 4031795360,4031795360 +.long 1144798328,1144798328 +.long 3131023141,3131023141 +.long 3819481163,3819481163 +.long 4082192802,4082192802 +.long 4272137053,4272137053 +.long 3225436288,3225436288 +.long 2324664069,2324664069 +.long 2912064063,2912064063 +.long 3164445985,3164445985 +.long 1211644016,1211644016 +.long 83228145,83228145 +.long 3753688163,3753688163 +.long 3249976951,3249976951 +.long 1977277103,1977277103 +.long 1663115586,1663115586 +.long 806359072,806359072 +.long 452984805,452984805 +.long 250868733,250868733 +.long 1842533055,1842533055 +.long 1288555905,1288555905 +.long 336333848,336333848 +.long 890442534,890442534 +.long 804056259,804056259 +.long 3781124030,3781124030 +.long 2727843637,2727843637 +.long 3427026056,3427026056 +.long 957814574,957814574 +.long 1472513171,1472513171 +.long 4071073621,4071073621 +.long 2189328124,2189328124 +.long 1195195770,1195195770 +.long 2892260552,2892260552 +.long 3881655738,3881655738 +.long 723065138,723065138 +.long 2507371494,2507371494 +.long 2690670784,2690670784 +.long 2558624025,2558624025 +.long 3511635870,3511635870 +.long 2145180835,2145180835 +.long 1713513028,1713513028 +.long 2116692564,2116692564 +.long 2878378043,2878378043 +.long 2206763019,2206763019 +.long 3393603212,3393603212 +.long 703524551,703524551 +.long 3552098411,3552098411 +.long 1007948840,1007948840 +.long 2044649127,2044649127 +.long 3797835452,3797835452 +.long 487262998,487262998 +.long 1994120109,1994120109 +.long 1004593371,1004593371 +.long 1446130276,1446130276 +.long 1312438900,1312438900 +.long 503974420,503974420 +.long 3679013266,3679013266 +.long 168166924,168166924 +.long 1814307912,1814307912 +.long 
3831258296,3831258296 +.long 1573044895,1573044895 +.long 1859376061,1859376061 +.long 4021070915,4021070915 +.long 2791465668,2791465668 +.long 2828112185,2828112185 +.long 2761266481,2761266481 +.long 937747667,937747667 +.long 2339994098,2339994098 +.long 854058965,854058965 +.long 1137232011,1137232011 +.long 1496790894,1496790894 +.long 3077402074,3077402074 +.long 2358086913,2358086913 +.long 1691735473,1691735473 +.long 3528347292,3528347292 +.long 3769215305,3769215305 +.long 3027004632,3027004632 +.long 4199962284,4199962284 +.long 133494003,133494003 +.long 636152527,636152527 +.long 2942657994,2942657994 +.long 2390391540,2390391540 +.long 3920539207,3920539207 +.long 403179536,403179536 +.long 3585784431,3585784431 +.long 2289596656,2289596656 +.long 1864705354,1864705354 +.long 1915629148,1915629148 +.long 605822008,605822008 +.long 4054230615,4054230615 +.long 3350508659,3350508659 +.long 1371981463,1371981463 +.long 602466507,602466507 +.long 2094914977,2094914977 +.long 2624877800,2624877800 +.long 555687742,555687742 +.long 3712699286,3712699286 +.long 3703422305,3703422305 +.long 2257292045,2257292045 +.long 2240449039,2240449039 +.long 2423288032,2423288032 +.long 1111375484,1111375484 +.long 3300242801,3300242801 +.long 2858837708,2858837708 +.long 3628615824,3628615824 +.long 84083462,84083462 +.long 32962295,32962295 +.long 302911004,302911004 +.long 2741068226,2741068226 +.long 1597322602,1597322602 +.long 4183250862,4183250862 +.long 3501832553,3501832553 +.long 2441512471,2441512471 +.long 1489093017,1489093017 +.long 656219450,656219450 +.long 3114180135,3114180135 +.long 954327513,954327513 +.long 335083755,335083755 +.long 3013122091,3013122091 +.long 856756514,856756514 +.long 3144247762,3144247762 +.long 1893325225,1893325225 +.long 2307821063,2307821063 +.long 2811532339,2811532339 +.long 3063651117,3063651117 +.long 572399164,572399164 +.long 2458355477,2458355477 +.long 552200649,552200649 +.long 1238290055,1238290055 +.long 4283782570,4283782570 +.long 2015897680,2015897680 +.long 2061492133,2061492133 +.long 2408352771,2408352771 +.long 4171342169,4171342169 +.long 2156497161,2156497161 +.long 386731290,386731290 +.long 3669999461,3669999461 +.long 837215959,837215959 +.long 3326231172,3326231172 +.long 3093850320,3093850320 +.long 3275833730,3275833730 +.long 2962856233,2962856233 +.long 1999449434,1999449434 +.long 286199582,286199582 +.long 3417354363,3417354363 +.long 4233385128,4233385128 +.long 3602627437,3602627437 +.long 974525996,974525996 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 
225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 
+.long 1,2,4,8 +.long 16,32,64,128 +.long 27,54,0,0 +.long 0,0,0,0 +.size _x86_AES_encrypt,.-_x86_AES_encrypt +.globl AES_encrypt +.type AES_encrypt,@function +.align 16 +AES_encrypt: +.L_AES_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L004pic_point +.L004pic_point: + popl %ebp + leal OPENSSL_ia32cap_P,%eax + leal .LAES_Te-.L004pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L005x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L005x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_encrypt,.-.L_AES_encrypt_begin +.type _x86_AES_decrypt_compact,@function +.align 16 +_x86_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L006loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%eax + subl %edi,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%edi + andl %eax,%edi + movl 
%edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %ecx,%eax + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ecx,%ebx + roll $8,%ecx + xorl %esi,%ebp + xorl %eax,%ecx + xorl %ebp,%eax + xorl %ebx,%ecx + xorl %ebp,%ebx + roll $24,%eax + xorl %ebp,%ecx + roll $16,%ebx + xorl %eax,%ecx + roll $8,%ebp + xorl %ebx,%ecx + movl 4(%esp),%eax + xorl %ebp,%ecx + movl %ecx,12(%esp) + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %edx,%ebx + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %edx,%ecx + roll $8,%edx + xorl %esi,%ebp + xorl %ebx,%edx + xorl %ebp,%ebx + xorl %ecx,%edx + xorl %ebp,%ecx + roll $24,%ebx + xorl %ebp,%edx + roll $16,%ecx + xorl %ebx,%edx + roll $8,%ebp + xorl %ecx,%edx + movl 8(%esp),%ebx + xorl %ebp,%edx + movl %edx,16(%esp) + movl $2155905152,%edi + andl %eax,%edi + movl %edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %eax,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %eax,%edx + roll $8,%eax + xorl %esi,%ebp + xorl %ecx,%eax + xorl %ebp,%ecx + xorl %edx,%eax + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%eax + roll $16,%edx + xorl %ecx,%eax + roll $8,%ebp + xorl %edx,%eax + xorl %ebp,%eax + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ebx,%edx + roll $8,%ebx + xorl %esi,%ebp + xorl %ecx,%ebx + xorl %ebp,%ecx + xorl %edx,%ebx + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%ebp + xorl %edx,%ebx + movl 12(%esp),%ecx + xorl %ebp,%ebx + movl 16(%esp),%edx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L006loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 
%esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact +.type _sse_AES_decrypt_compact,@function +.align 16 +_sse_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L007loop: + pshufw $12,%mm0,%mm1 + pshufw $9,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $6,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $3,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movd %mm2,%eax + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $16,%esi + movd %mm6,%ebx + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %al,%edi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $16,%esi + shrl $16,%eax + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shrl $16,%ebx + shll $8,%esi + movd %edx,%mm1 + movzbl -128(%ebp,%edi,1),%edx + movzbl %bh,%edi + shll $24,%edx + andl $255,%ebx + orl %esi,%edx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movzbl %ah,%eax + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + orl %ebx,%edx + shll $16,%esi + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%edx + shll $24,%eax + orl %eax,%ecx + movl 20(%esp),%edi + movd %edx,%mm4 + movd %ecx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L008out + movq %mm0,%mm3 + movq %mm4,%mm7 + pshufw $228,%mm0,%mm2 + pshufw $228,%mm4,%mm6 + movq %mm0,%mm1 + movq %mm4,%mm5 + pshufw 
$177,%mm0,%mm0 + pshufw $177,%mm4,%mm4 + pslld $8,%mm2 + pslld $8,%mm6 + psrld $8,%mm3 + psrld $8,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pslld $16,%mm2 + pslld $16,%mm6 + psrld $16,%mm3 + psrld $16,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movq 8(%esp),%mm3 + pxor %mm2,%mm2 + pxor %mm6,%mm6 + pcmpgtb %mm1,%mm2 + pcmpgtb %mm5,%mm6 + pand %mm3,%mm2 + pand %mm3,%mm6 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm2,%mm1 + pxor %mm6,%mm5 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq %mm1,%mm2 + movq %mm5,%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pslld $24,%mm3 + pslld $24,%mm7 + psrld $8,%mm2 + psrld $8,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pshufw $177,%mm1,%mm3 + pshufw $177,%mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + pshufw $177,%mm1,%mm2 + pshufw $177,%mm5,%mm6 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pslld $8,%mm1 + pslld $8,%mm5 + psrld $8,%mm3 + psrld $8,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl -128(%ebp),%eax + pslld $16,%mm1 + pslld $16,%mm5 + movl -64(%ebp),%ebx + psrld $16,%mm3 + psrld $16,%mm7 + movl (%ebp),%ecx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl 64(%ebp),%edx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L007loop +.align 16 +.L008out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact +.type _x86_AES_decrypt,@function +.align 16 +_x86_AES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L009loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ebx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %ah,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + andl $255,%edx + movl (%ebp,%edx,8),%edx + movzbl %ch,%ecx + xorl 3(%ebp,%ecx,8),%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + shrl $24,%eax + xorl 1(%ebp,%eax,8),%edx + movl 4(%esp),%eax + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L009loop + leal 2176(%ebp),%ebp + movl -128(%ebp),%edi + movl -96(%ebp),%esi + 
movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi + leal -128(%ebp),%ebp + movl %eax,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl (%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl (%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl (%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl (%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + leal -2048(%ebp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Td: +.long 1353184337,1353184337 +.long 1399144830,1399144830 +.long 3282310938,3282310938 +.long 2522752826,2522752826 +.long 3412831035,3412831035 +.long 4047871263,4047871263 +.long 2874735276,2874735276 +.long 2466505547,2466505547 +.long 1442459680,1442459680 +.long 4134368941,4134368941 +.long 2440481928,2440481928 +.long 625738485,625738485 +.long 4242007375,4242007375 +.long 3620416197,3620416197 +.long 2151953702,2151953702 +.long 2409849525,2409849525 +.long 1230680542,1230680542 +.long 1729870373,1729870373 +.long 2551114309,2551114309 +.long 3787521629,3787521629 +.long 41234371,41234371 +.long 317738113,317738113 +.long 2744600205,2744600205 +.long 3338261355,3338261355 +.long 3881799427,3881799427 +.long 2510066197,2510066197 +.long 3950669247,3950669247 +.long 3663286933,3663286933 +.long 763608788,763608788 +.long 3542185048,3542185048 +.long 694804553,694804553 +.long 1154009486,1154009486 +.long 1787413109,1787413109 +.long 2021232372,2021232372 +.long 1799248025,1799248025 +.long 3715217703,3715217703 +.long 3058688446,3058688446 +.long 397248752,397248752 +.long 1722556617,1722556617 +.long 3023752829,3023752829 +.long 407560035,407560035 +.long 2184256229,2184256229 +.long 1613975959,1613975959 +.long 1165972322,1165972322 +.long 3765920945,3765920945 +.long 2226023355,2226023355 +.long 480281086,480281086 +.long 2485848313,2485848313 +.long 1483229296,1483229296 +.long 436028815,436028815 +.long 2272059028,2272059028 +.long 3086515026,3086515026 +.long 601060267,601060267 +.long 3791801202,3791801202 +.long 1468997603,1468997603 +.long 715871590,715871590 +.long 120122290,120122290 +.long 63092015,63092015 +.long 2591802758,2591802758 +.long 2768779219,2768779219 +.long 4068943920,4068943920 +.long 2997206819,2997206819 +.long 3127509762,3127509762 +.long 
1552029421,1552029421 +.long 723308426,723308426 +.long 2461301159,2461301159 +.long 4042393587,4042393587 +.long 2715969870,2715969870 +.long 3455375973,3455375973 +.long 3586000134,3586000134 +.long 526529745,526529745 +.long 2331944644,2331944644 +.long 2639474228,2639474228 +.long 2689987490,2689987490 +.long 853641733,853641733 +.long 1978398372,1978398372 +.long 971801355,971801355 +.long 2867814464,2867814464 +.long 111112542,111112542 +.long 1360031421,1360031421 +.long 4186579262,4186579262 +.long 1023860118,1023860118 +.long 2919579357,2919579357 +.long 1186850381,1186850381 +.long 3045938321,3045938321 +.long 90031217,90031217 +.long 1876166148,1876166148 +.long 4279586912,4279586912 +.long 620468249,620468249 +.long 2548678102,2548678102 +.long 3426959497,3426959497 +.long 2006899047,2006899047 +.long 3175278768,3175278768 +.long 2290845959,2290845959 +.long 945494503,945494503 +.long 3689859193,3689859193 +.long 1191869601,1191869601 +.long 3910091388,3910091388 +.long 3374220536,3374220536 +.long 0,0 +.long 2206629897,2206629897 +.long 1223502642,1223502642 +.long 2893025566,2893025566 +.long 1316117100,1316117100 +.long 4227796733,4227796733 +.long 1446544655,1446544655 +.long 517320253,517320253 +.long 658058550,658058550 +.long 1691946762,1691946762 +.long 564550760,564550760 +.long 3511966619,3511966619 +.long 976107044,976107044 +.long 2976320012,2976320012 +.long 266819475,266819475 +.long 3533106868,3533106868 +.long 2660342555,2660342555 +.long 1338359936,1338359936 +.long 2720062561,2720062561 +.long 1766553434,1766553434 +.long 370807324,370807324 +.long 179999714,179999714 +.long 3844776128,3844776128 +.long 1138762300,1138762300 +.long 488053522,488053522 +.long 185403662,185403662 +.long 2915535858,2915535858 +.long 3114841645,3114841645 +.long 3366526484,3366526484 +.long 2233069911,2233069911 +.long 1275557295,1275557295 +.long 3151862254,3151862254 +.long 4250959779,4250959779 +.long 2670068215,2670068215 +.long 3170202204,3170202204 +.long 3309004356,3309004356 +.long 880737115,880737115 +.long 1982415755,1982415755 +.long 3703972811,3703972811 +.long 1761406390,1761406390 +.long 1676797112,1676797112 +.long 3403428311,3403428311 +.long 277177154,277177154 +.long 1076008723,1076008723 +.long 538035844,538035844 +.long 2099530373,2099530373 +.long 4164795346,4164795346 +.long 288553390,288553390 +.long 1839278535,1839278535 +.long 1261411869,1261411869 +.long 4080055004,4080055004 +.long 3964831245,3964831245 +.long 3504587127,3504587127 +.long 1813426987,1813426987 +.long 2579067049,2579067049 +.long 4199060497,4199060497 +.long 577038663,577038663 +.long 3297574056,3297574056 +.long 440397984,440397984 +.long 3626794326,3626794326 +.long 4019204898,4019204898 +.long 3343796615,3343796615 +.long 3251714265,3251714265 +.long 4272081548,4272081548 +.long 906744984,906744984 +.long 3481400742,3481400742 +.long 685669029,685669029 +.long 646887386,646887386 +.long 2764025151,2764025151 +.long 3835509292,3835509292 +.long 227702864,227702864 +.long 2613862250,2613862250 +.long 1648787028,1648787028 +.long 3256061430,3256061430 +.long 3904428176,3904428176 +.long 1593260334,1593260334 +.long 4121936770,4121936770 +.long 3196083615,3196083615 +.long 2090061929,2090061929 +.long 2838353263,2838353263 +.long 3004310991,3004310991 +.long 999926984,999926984 +.long 2809993232,2809993232 +.long 1852021992,1852021992 +.long 2075868123,2075868123 +.long 158869197,158869197 +.long 4095236462,4095236462 +.long 28809964,28809964 +.long 2828685187,2828685187 +.long 
1701746150,1701746150 +.long 2129067946,2129067946 +.long 147831841,147831841 +.long 3873969647,3873969647 +.long 3650873274,3650873274 +.long 3459673930,3459673930 +.long 3557400554,3557400554 +.long 3598495785,3598495785 +.long 2947720241,2947720241 +.long 824393514,824393514 +.long 815048134,815048134 +.long 3227951669,3227951669 +.long 935087732,935087732 +.long 2798289660,2798289660 +.long 2966458592,2966458592 +.long 366520115,366520115 +.long 1251476721,1251476721 +.long 4158319681,4158319681 +.long 240176511,240176511 +.long 804688151,804688151 +.long 2379631990,2379631990 +.long 1303441219,1303441219 +.long 1414376140,1414376140 +.long 3741619940,3741619940 +.long 3820343710,3820343710 +.long 461924940,461924940 +.long 3089050817,3089050817 +.long 2136040774,2136040774 +.long 82468509,82468509 +.long 1563790337,1563790337 +.long 1937016826,1937016826 +.long 776014843,776014843 +.long 1511876531,1511876531 +.long 1389550482,1389550482 +.long 861278441,861278441 +.long 323475053,323475053 +.long 2355222426,2355222426 +.long 2047648055,2047648055 +.long 2383738969,2383738969 +.long 2302415851,2302415851 +.long 3995576782,3995576782 +.long 902390199,902390199 +.long 3991215329,3991215329 +.long 1018251130,1018251130 +.long 1507840668,1507840668 +.long 1064563285,1064563285 +.long 2043548696,2043548696 +.long 3208103795,3208103795 +.long 3939366739,3939366739 +.long 1537932639,1537932639 +.long 342834655,342834655 +.long 2262516856,2262516856 +.long 2180231114,2180231114 +.long 1053059257,1053059257 +.long 741614648,741614648 +.long 1598071746,1598071746 +.long 1925389590,1925389590 +.long 203809468,203809468 +.long 2336832552,2336832552 +.long 1100287487,1100287487 +.long 1895934009,1895934009 +.long 3736275976,3736275976 +.long 2632234200,2632234200 +.long 2428589668,2428589668 +.long 1636092795,1636092795 +.long 1890988757,1890988757 +.long 1952214088,1952214088 +.long 1113045200,1113045200 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 
247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.size _x86_AES_decrypt,.-_x86_AES_decrypt +.globl AES_decrypt +.type AES_decrypt,@function +.align 16 +AES_decrypt: +.L_AES_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L010pic_point +.L010pic_point: + popl %ebp + leal OPENSSL_ia32cap_P,%eax + leal .LAES_Td-.L010pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl 
%ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L011x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L011x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_decrypt,.-.L_AES_decrypt_begin +.globl AES_cbc_encrypt +.type AES_cbc_encrypt,@function +.align 16 +AES_cbc_encrypt: +.L_AES_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%ecx + cmpl $0,%ecx + je .L012drop_out + call .L013pic_point +.L013pic_point: + popl %ebp + leal OPENSSL_ia32cap_P,%eax + cmpl $0,40(%esp) + leal .LAES_Te-.L013pic_point(%ebp),%ebp + jne .L014picked_te + leal .LAES_Td-.LAES_Te(%ebp),%ebp +.L014picked_te: + pushfl + cld + cmpl $512,%ecx + jb .L015slow_way + testl $15,%ecx + jnz .L015slow_way + btl $28,(%eax) + jc .L015slow_way + leal -324(%esp),%esi + andl $-64,%esi + movl %ebp,%eax + leal 2304(%ebp),%ebx + movl %esi,%edx + andl $4095,%eax + andl $4095,%ebx + andl $4095,%edx + cmpl %ebx,%edx + jb .L016tbl_break_out + subl %ebx,%edx + subl %edx,%esi + jmp .L017tbl_ok +.align 4 +.L016tbl_break_out: + subl %eax,%edx + andl $4095,%edx + addl $384,%edx + subl %edx,%esi +.align 4 +.L017tbl_ok: + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 12(%edx),%edi + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl $0,316(%esp) + movl %edi,%ebx + movl $61,%ecx + subl %ebp,%ebx + movl %edi,%esi + andl $4095,%ebx + leal 76(%esp),%edi + cmpl $2304,%ebx + jb .L018do_copy + cmpl $3852,%ebx + jb .L019skip_copy +.align 4 +.L018do_copy: + movl %edi,44(%esp) +.long 2784229001 +.L019skip_copy: + movl $16,%edi +.align 4 +.L020prefetch_tbl: + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%esi + leal 128(%ebp),%ebp + subl $1,%edi + jnz .L020prefetch_tbl + subl $2048,%ebp + movl 32(%esp),%esi + movl 48(%esp),%edi + cmpl $0,%edx + je .L021fast_decrypt + movl (%edi),%eax + movl 4(%edi),%ebx +.align 16 +.L022fast_enc_loop: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + leal 16(%esi),%esi + movl 40(%esp),%ecx + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L022fast_enc_loop + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L023skip_ezero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L023skip_ezero: + movl 28(%esp),%esp + popfl +.L012drop_out: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L021fast_decrypt: + cmpl 36(%esp),%esi + je .L024fast_dec_in_place + movl %edi,52(%esp) +.align 4 
+.align 16 +.L025fast_dec_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 52(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 36(%esp),%edi + movl 32(%esp),%esi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl %esi,52(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edi + movl %edi,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L025fast_dec_loop + movl 52(%esp),%edi + movl 48(%esp),%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + jmp .L026fast_dec_out +.align 16 +.L024fast_dec_in_place: +.L027fast_dec_in_place_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 48(%esp),%edi + movl 36(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L027fast_dec_in_place_loop +.align 4 +.L026fast_dec_out: + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L028skip_dzero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L028skip_dzero: + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L015slow_way: + movl (%eax),%eax + movl 36(%esp),%edi + leal -80(%esp),%esi + andl $-64,%esi + leal -143(%edi),%ebx + subl %esi,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esi + leal 768(%esi),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl %eax,52(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl %esi,%edi + movl %eax,%esi + cmpl $0,%edx + je .L029slow_decrypt + cmpl $16,%ecx + movl %ebx,%edx + jb .L030slow_enc_tail + btl $25,52(%esp) + jnc .L031slow_enc_x86 + movq (%edi),%mm0 + movq 8(%edi),%mm4 +.align 16 +.L032slow_enc_loop_sse: + pxor (%esi),%mm0 + pxor 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl 40(%esp),%ecx + movq %mm0,(%edi) + movq %mm4,8(%edi) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L032slow_enc_loop_sse + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L031slow_enc_x86: + movl (%edi),%eax + movl 4(%edi),%ebx +.align 4 +.L033slow_enc_loop_x86: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 
12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L033slow_enc_loop_x86 + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L030slow_enc_tail: + emms + movl %edx,%edi + movl $16,%ebx + subl %ecx,%ebx + cmpl %esi,%edi + je .L034enc_in_place +.align 4 +.long 2767451785 + jmp .L035enc_skip_in_place +.L034enc_in_place: + leal (%edi,%ecx,1),%edi +.L035enc_skip_in_place: + movl %ebx,%ecx + xorl %eax,%eax +.align 4 +.long 2868115081 + movl 48(%esp),%edi + movl %edx,%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl $16,40(%esp) + jmp .L033slow_enc_loop_x86 +.align 16 +.L029slow_decrypt: + btl $25,52(%esp) + jnc .L036slow_dec_loop_x86 +.align 4 +.L037slow_dec_loop_sse: + movq (%esi),%mm0 + movq 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_decrypt_compact + movl 32(%esp),%esi + leal 60(%esp),%eax + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl 48(%esp),%edi + movq (%esi),%mm1 + movq 8(%esi),%mm5 + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movq %mm1,(%edi) + movq %mm5,8(%edi) + subl $16,%ecx + jc .L038slow_dec_partial_sse + movq %mm0,(%ebx) + movq %mm4,8(%ebx) + leal 16(%ebx),%ebx + movl %ebx,36(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + movl %ecx,40(%esp) + jnz .L037slow_dec_loop_sse + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L038slow_dec_partial_sse: + movq %mm0,(%eax) + movq %mm4,8(%eax) + emms + addl $16,%ecx + movl %ebx,%edi + movl %eax,%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L036slow_dec_loop_x86: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt_compact + movl 48(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + subl $16,%esi + jc .L039slow_dec_partial_x86 + movl %esi,40(%esp) + movl 36(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + leal 16(%esi),%esi + movl %esi,32(%esp) + jnz .L036slow_dec_loop_x86 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L039slow_dec_partial_x86: + leal 60(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 32(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl 36(%esp),%edi + leal 60(%esp),%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl 
%ebp + ret +.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin +.type _x86_AES_set_encrypt_key,@function +.align 16 +_x86_AES_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%esi + movl 32(%esp),%edi + testl $-1,%esi + jz .L040badpointer + testl $-1,%edi + jz .L040badpointer + call .L041pic_point +.L041pic_point: + popl %ebp + leal .LAES_Te-.L041pic_point(%ebp),%ebp + leal 2176(%ebp),%ebp + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx + movl 28(%esp),%ecx + cmpl $128,%ecx + je .L04210rounds + cmpl $192,%ecx + je .L04312rounds + cmpl $256,%ecx + je .L04414rounds + movl $-2,%eax + jmp .L045exit +.L04210rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + xorl %ecx,%ecx + jmp .L04610shortcut +.align 4 +.L04710loop: + movl (%edi),%eax + movl 12(%edi),%edx +.L04610shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,16(%edi) + xorl 4(%edi),%eax + movl %eax,20(%edi) + xorl 8(%edi),%eax + movl %eax,24(%edi) + xorl 12(%edi),%eax + movl %eax,28(%edi) + incl %ecx + addl $16,%edi + cmpl $10,%ecx + jl .L04710loop + movl $10,80(%edi) + xorl %eax,%eax + jmp .L045exit +.L04312rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%ecx + movl 20(%esi),%edx + movl %ecx,16(%edi) + movl %edx,20(%edi) + xorl %ecx,%ecx + jmp .L04812shortcut +.align 4 +.L04912loop: + movl (%edi),%eax + movl 20(%edi),%edx +.L04812shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,24(%edi) + xorl 4(%edi),%eax + movl %eax,28(%edi) + xorl 8(%edi),%eax + movl %eax,32(%edi) + xorl 12(%edi),%eax + movl %eax,36(%edi) + cmpl $7,%ecx + je .L05012break + incl %ecx + xorl 16(%edi),%eax + movl %eax,40(%edi) + xorl 20(%edi),%eax + movl %eax,44(%edi) + addl $24,%edi + jmp .L04912loop +.L05012break: + movl $12,72(%edi) + xorl %eax,%eax + jmp .L045exit +.L04414rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,16(%edi) + movl %ebx,20(%edi) + movl %ecx,24(%edi) + movl %edx,28(%edi) + xorl %ecx,%ecx + jmp .L05114shortcut +.align 4 +.L05214loop: + movl 28(%edi),%edx +.L05114shortcut: + movl (%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll 
$8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,32(%edi) + xorl 4(%edi),%eax + movl %eax,36(%edi) + xorl 8(%edi),%eax + movl %eax,40(%edi) + xorl 12(%edi),%eax + movl %eax,44(%edi) + cmpl $6,%ecx + je .L05314break + incl %ecx + movl %eax,%edx + movl 16(%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + shll $8,%ebx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $16,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $24,%ebx + xorl %ebx,%eax + movl %eax,48(%edi) + xorl 20(%edi),%eax + movl %eax,52(%edi) + xorl 24(%edi),%eax + movl %eax,56(%edi) + xorl 28(%edi),%eax + movl %eax,60(%edi) + addl $32,%edi + jmp .L05214loop +.L05314break: + movl $14,48(%edi) + xorl %eax,%eax + jmp .L045exit +.L040badpointer: + movl $-1,%eax +.L045exit: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,@function +.align 16 +AES_set_encrypt_key: +.L_AES_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + ret +.size AES_set_encrypt_key,.-.L_AES_set_encrypt_key_begin +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,@function +.align 16 +AES_set_decrypt_key: +.L_AES_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + cmpl $0,%eax + je .L054proceed + ret +.L054proceed: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%esi + movl 240(%esi),%ecx + leal (,%ecx,4),%ecx + leal (%esi,%ecx,4),%edi +.align 4 +.L055invert: + movl (%esi),%eax + movl 4(%esi),%ebx + movl (%edi),%ecx + movl 4(%edi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,(%esi) + movl %edx,4(%esi) + movl 8(%esi),%eax + movl 12(%esi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,8(%edi) + movl %ebx,12(%edi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + addl $16,%esi + subl $16,%edi + cmpl %edi,%esi + jne .L055invert + movl 28(%esp),%edi + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,28(%esp) + movl 16(%edi),%eax +.align 4 +.L056permute: + addl $16,%edi + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %eax,%ebx + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + xorl %eax,%ecx + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + roll $8,%eax + xorl %esi,%edx + movl 4(%edi),%ebp + xorl %ebx,%eax + xorl %edx,%ebx + xorl %ecx,%eax + roll $24,%ebx + xorl %edx,%ecx + xorl %edx,%eax + roll $16,%ecx + xorl %ebx,%eax + roll $8,%edx + xorl %ecx,%eax + movl %ebp,%ebx + xorl %edx,%eax + movl %eax,(%edi) + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + 
xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + xorl %ebx,%edx + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + roll $8,%ebx + xorl %esi,%eax + movl 8(%edi),%ebp + xorl %ecx,%ebx + xorl %eax,%ecx + xorl %edx,%ebx + roll $24,%ecx + xorl %eax,%edx + xorl %eax,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%eax + xorl %edx,%ebx + movl %ebp,%ecx + xorl %eax,%ebx + movl %ebx,4(%edi) + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %ecx,%edx + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + xorl %ecx,%eax + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + roll $8,%ecx + xorl %esi,%ebx + movl 12(%edi),%ebp + xorl %edx,%ecx + xorl %ebx,%edx + xorl %eax,%ecx + roll $24,%edx + xorl %ebx,%eax + xorl %ebx,%ecx + roll $16,%eax + xorl %edx,%ecx + roll $8,%ebx + xorl %eax,%ecx + movl %ebp,%edx + xorl %ebx,%ecx + movl %ecx,8(%edi) + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %edx,%eax + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + xorl %edx,%ebx + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + roll $8,%edx + xorl %esi,%ecx + movl 16(%edi),%ebp + xorl %eax,%edx + xorl %ecx,%eax + xorl %ebx,%edx + roll $24,%eax + xorl %ecx,%ebx + xorl %ecx,%edx + roll $16,%ebx + xorl %eax,%edx + roll $8,%ecx + xorl %ebx,%edx + movl %ebp,%eax + xorl %ecx,%edx + movl %edx,12(%edi) + cmpl 28(%esp),%edi + jb .L056permute + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_set_decrypt_key,.-.L_AES_set_decrypt_key_begin +.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89 +.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 +.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: +#endif diff --git a/sys/crypto/openssl/i386/aesni-x86.S b/sys/crypto/openssl/i386/aesni-x86.S --- a/sys/crypto/openssl/i386/aesni-x86.S +++ b/sys/crypto/openssl/i386/aesni-x86.S @@ -6,6 +6,11 @@ .align 16 aesni_encrypt: .L_aesni_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -33,6 +38,11 @@ .align 16 aesni_decrypt: .L_aesni_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -58,6 +68,11 @@ .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -85,6 +100,11 @@ .type _aesni_decrypt2,@function .align 16 _aesni_decrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + 
movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -112,6 +132,11 @@ .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -144,6 +169,11 @@ .type _aesni_decrypt3,@function .align 16 _aesni_decrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -176,6 +206,11 @@ .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -214,6 +249,11 @@ .type _aesni_decrypt4,@function .align 16 _aesni_decrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -252,6 +292,11 @@ .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -306,6 +351,11 @@ .type _aesni_decrypt6,@function .align 16 _aesni_decrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -362,6 +412,11 @@ .align 16 aesni_ecb_encrypt: .L_aesni_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -597,6 +652,11 @@ .align 16 aesni_ccm64_encrypt_blocks: .L_aesni_ccm64_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -685,6 +745,11 @@ .align 16 aesni_ccm64_decrypt_blocks: .L_aesni_ccm64_decrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -808,6 +873,11 @@ .align 16 aesni_ctr32_encrypt_blocks: .L_aesni_ctr32_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1046,6 +1116,11 @@ .align 16 aesni_xts_encrypt: .L_aesni_xts_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1406,6 +1481,11 @@ .align 16 aesni_xts_decrypt: .L_aesni_xts_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1796,6 +1876,11 @@ .align 16 aesni_ocb_encrypt: .L_aesni_ocb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2191,6 +2276,11 @@ .align 16 aesni_ocb_decrypt: .L_aesni_ocb_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2586,6 +2676,11 @@ .align 16 aesni_cbc_encrypt: .L_aesni_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2845,6 +2940,11 @@ .type _aesni_set_encrypt_key,@function .align 16 _aesni_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx testl %eax,%eax @@ -3180,6 +3280,11 @@ .align 16 aesni_set_encrypt_key: .L_aesni_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -3191,6 +3296,11 @@ .align 16 aesni_set_decrypt_key: .L_aesni_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -3237,6 +3347,23 @@ .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 
0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl aesni_encrypt @@ -3244,6 +3371,11 @@ .align 16 aesni_encrypt: .L_aesni_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -3271,6 +3403,11 @@ .align 16 aesni_decrypt: .L_aesni_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -3296,6 +3433,11 @@ .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3323,6 +3465,11 @@ .type _aesni_decrypt2,@function .align 16 _aesni_decrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3350,6 +3497,11 @@ .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3382,6 +3534,11 @@ .type _aesni_decrypt3,@function .align 16 _aesni_decrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3414,6 +3571,11 @@ .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -3452,6 +3614,11 @@ .type _aesni_decrypt4,@function .align 16 _aesni_decrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -3490,6 +3657,11 @@ .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3544,6 +3716,11 @@ .type _aesni_decrypt6,@function .align 16 _aesni_decrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3600,6 +3777,11 @@ .align 16 aesni_ecb_encrypt: .L_aesni_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3835,6 +4017,11 @@ .align 16 aesni_ccm64_encrypt_blocks: .L_aesni_ccm64_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3923,6 +4110,11 @@ .align 16 aesni_ccm64_decrypt_blocks: .L_aesni_ccm64_decrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4046,6 +4238,11 @@ .align 16 aesni_ctr32_encrypt_blocks: .L_aesni_ctr32_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4284,6 +4481,11 @@ .align 16 aesni_xts_encrypt: .L_aesni_xts_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4644,6 +4846,11 @@ .align 16 aesni_xts_decrypt: .L_aesni_xts_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5034,6 +5241,11 @@ .align 16 aesni_ocb_encrypt: .L_aesni_ocb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5429,6 +5641,11 @@ .align 16 aesni_ocb_decrypt: .L_aesni_ocb_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5824,6 +6041,11 @@ .align 16 aesni_cbc_encrypt: .L_aesni_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -6083,6 +6305,11 @@ .type _aesni_set_encrypt_key,@function 
.align 16 _aesni_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx testl %eax,%eax @@ -6418,6 +6645,11 @@ .align 16 aesni_set_encrypt_key: .L_aesni_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -6429,6 +6661,11 @@ .align 16 aesni_set_decrypt_key: .L_aesni_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -6475,4 +6712,21 @@ .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/bf-586.S b/sys/crypto/openssl/i386/bf-586.S --- a/sys/crypto/openssl/i386/bf-586.S +++ b/sys/crypto/openssl/i386/bf-586.S @@ -6,6 +6,11 @@ .align 16 BF_encrypt: .L_BF_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -356,6 +361,11 @@ .align 16 BF_decrypt: .L_BF_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -706,6 +716,11 @@ .align 16 BF_cbc_encrypt: .L_BF_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -769,21 +784,56 @@ xorl %edx,%edx jmp *%ebp .L006ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L007ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L008ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L009ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L010ejend .L011ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L012ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L013ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L010ejend: xorl %ecx,%eax @@ -895,6 +945,23 @@ .long .L006ej7-.L004PIC_point .align 64 .size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl BF_encrypt @@ -902,6 +969,11 @@ .align 16 BF_encrypt: .L_BF_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1252,6 +1324,11 @@ .align 16 BF_decrypt: .L_BF_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1602,6 +1679,11 @@ .align 16 BF_cbc_encrypt: .L_BF_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1665,21 +1747,56 @@ xorl %edx,%edx jmp *%ebp .L006ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L007ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L008ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L009ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L010ejend .L011ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L012ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L013ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L010ejend: xorl %ecx,%eax @@ -1791,4 +1908,21 @@ .long 
.L006ej7-.L004PIC_point .align 64 .size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/bn-586.S b/sys/crypto/openssl/i386/bn-586.S --- a/sys/crypto/openssl/i386/bn-586.S +++ b/sys/crypto/openssl/i386/bn-586.S @@ -6,6 +6,11 @@ .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L000PIC_me_up .L000PIC_me_up: popl %eax @@ -289,6 +294,11 @@ .align 16 bn_mul_words: .L_bn_mul_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L010PIC_me_up .L010PIC_me_up: popl %eax @@ -471,6 +481,11 @@ .align 16 bn_sqr_words: .L_bn_sqr_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L017PIC_me_up .L017PIC_me_up: popl %eax @@ -612,6 +627,11 @@ .align 16 bn_div_words: .L_bn_div_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax movl 12(%esp),%ecx @@ -623,6 +643,11 @@ .align 16 bn_add_words: .L_bn_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -805,6 +830,11 @@ .align 16 bn_sub_words: .L_bn_sub_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -987,6 +1017,11 @@ .align 16 bn_sub_part_words: .L_bn_sub_part_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1529,6 +1564,23 @@ ret .size bn_sub_part_words,.-.L_bn_sub_part_words_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl bn_mul_add_words @@ -1536,6 +1588,11 @@ .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%eax btl $26,(%eax) jnc .L000maw_non_sse2 @@ -1816,6 +1873,11 @@ .align 16 bn_mul_words: .L_bn_mul_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%eax btl $26,(%eax) jnc .L009mw_non_sse2 @@ -1995,6 +2057,11 @@ .align 16 bn_sqr_words: .L_bn_sqr_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%eax btl $26,(%eax) jnc .L015sqr_non_sse2 @@ -2133,6 +2200,11 @@ .align 16 bn_div_words: .L_bn_div_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax movl 12(%esp),%ecx @@ -2144,6 +2216,11 @@ .align 16 bn_add_words: .L_bn_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2326,6 +2403,11 @@ .align 16 bn_sub_words: .L_bn_sub_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2508,6 +2590,11 @@ .align 16 bn_sub_part_words: .L_bn_sub_part_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3050,4 +3137,21 @@ ret .size bn_sub_part_words,.-.L_bn_sub_part_words_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/cast-586.S b/sys/crypto/openssl/i386/cast-586.S --- 
a/sys/crypto/openssl/i386/cast-586.S +++ b/sys/crypto/openssl/i386/cast-586.S @@ -6,6 +6,11 @@ .align 16 CAST_encrypt: .L_CAST_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -376,6 +381,11 @@ .align 16 CAST_decrypt: .L_CAST_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -743,6 +753,11 @@ .align 16 CAST_cbc_encrypt: .L_CAST_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -806,21 +821,56 @@ xorl %edx,%edx jmp *%ebp .L008ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L009ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L010ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L011ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L012ejend .L013ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L014ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L015ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L012ejend: xorl %ecx,%eax @@ -932,6 +982,23 @@ .long .L008ej7-.L006PIC_point .align 64 .size CAST_cbc_encrypt,.-.L_CAST_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl CAST_encrypt @@ -939,6 +1006,11 @@ .align 16 CAST_encrypt: .L_CAST_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1309,6 +1381,11 @@ .align 16 CAST_decrypt: .L_CAST_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1676,6 +1753,11 @@ .align 16 CAST_cbc_encrypt: .L_CAST_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1739,21 +1821,56 @@ xorl %edx,%edx jmp *%ebp .L008ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L009ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L010ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L011ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L012ejend .L013ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L014ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L015ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L012ejend: xorl %ecx,%eax @@ -1865,4 +1982,21 @@ .long .L008ej7-.L006PIC_point .align 64 .size CAST_cbc_encrypt,.-.L_CAST_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/chacha-x86.S b/sys/crypto/openssl/i386/chacha-x86.S --- a/sys/crypto/openssl/i386/chacha-x86.S +++ b/sys/crypto/openssl/i386/chacha-x86.S @@ -6,6 +6,11 @@ .align 16 ChaCha20_ctr32: .L_ChaCha20_ctr32_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -379,6 +384,11 @@ .align 16 ChaCha20_ssse3: .L_ChaCha20_ssse3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -534,6 +544,11 @@ .align 16 ChaCha20_xop: .L_ChaCha20_xop_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx 
pushl %esi @@ -1008,6 +1023,23 @@ ret .size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl ChaCha20_ctr32 @@ -1015,6 +1047,11 @@ .align 16 ChaCha20_ctr32: .L_ChaCha20_ctr32_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1388,6 +1425,11 @@ .align 16 ChaCha20_ssse3: .L_ChaCha20_ssse3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1543,6 +1585,11 @@ .align 16 ChaCha20_xop: .L_ChaCha20_xop_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2017,4 +2064,21 @@ ret .size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/cmll-x86.S b/sys/crypto/openssl/i386/cmll-x86.S --- a/sys/crypto/openssl/i386/cmll-x86.S +++ b/sys/crypto/openssl/i386/cmll-x86.S @@ -6,6 +6,11 @@ .align 16 Camellia_EncryptBlock_Rounds: .L_Camellia_EncryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -60,6 +65,11 @@ .align 16 Camellia_EncryptBlock: .L_Camellia_EncryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -72,6 +82,11 @@ .align 16 Camellia_encrypt: .L_Camellia_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -124,6 +139,11 @@ .type _x86_Camellia_encrypt,@function .align 16 _x86_Camellia_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -354,6 +374,11 @@ .align 16 Camellia_DecryptBlock_Rounds: .L_Camellia_DecryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -408,6 +433,11 @@ .align 16 Camellia_DecryptBlock: .L_Camellia_DecryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -420,6 +450,11 @@ .align 16 Camellia_decrypt: .L_Camellia_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -472,6 +507,11 @@ .type _x86_Camellia_decrypt,@function .align 16 _x86_Camellia_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -702,6 +742,11 @@ .align 16 Camellia_Ekeygen: .L_Camellia_Ekeygen_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1543,6 +1588,11 @@ .align 16 Camellia_set_key: .L_Camellia_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ecx movl 12(%esp),%ebx @@ -2095,6 +2145,11 @@ .align 16 Camellia_cbc_encrypt: .L_Camellia_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2374,6 +2429,23 @@ .byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 
0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl Camellia_EncryptBlock_Rounds @@ -2381,6 +2453,11 @@ .align 16 Camellia_EncryptBlock_Rounds: .L_Camellia_EncryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2435,6 +2512,11 @@ .align 16 Camellia_EncryptBlock: .L_Camellia_EncryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -2447,6 +2529,11 @@ .align 16 Camellia_encrypt: .L_Camellia_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2499,6 +2586,11 @@ .type _x86_Camellia_encrypt,@function .align 16 _x86_Camellia_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -2729,6 +2821,11 @@ .align 16 Camellia_DecryptBlock_Rounds: .L_Camellia_DecryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2783,6 +2880,11 @@ .align 16 Camellia_DecryptBlock: .L_Camellia_DecryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -2795,6 +2897,11 @@ .align 16 Camellia_decrypt: .L_Camellia_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2847,6 +2954,11 @@ .type _x86_Camellia_decrypt,@function .align 16 _x86_Camellia_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -3077,6 +3189,11 @@ .align 16 Camellia_Ekeygen: .L_Camellia_Ekeygen_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3918,6 +4035,11 @@ .align 16 Camellia_set_key: .L_Camellia_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ecx movl 12(%esp),%ebx @@ -4470,6 +4592,11 @@ .align 16 Camellia_cbc_encrypt: .L_Camellia_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4749,4 +4876,21 @@ .byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/co-586.S b/sys/crypto/openssl/i386/co-586.S --- a/sys/crypto/openssl/i386/co-586.S +++ b/sys/crypto/openssl/i386/co-586.S @@ -6,6 +6,11 @@ .align 16 bn_mul_comba8: .L_bn_mul_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -550,6 +555,11 @@ .align 16 bn_mul_comba4: .L_bn_mul_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -718,6 +728,11 @@ .align 16 bn_sqr_comba8: .L_bn_sqr_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -1126,6 +1141,11 @@ .align 16 bn_sqr_comba4: .L_bn_sqr_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -1253,6 +1273,23 @@ popl %esi ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text 
.globl bn_mul_comba8 @@ -1260,6 +1297,11 @@ .align 16 bn_mul_comba8: .L_bn_mul_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -1804,6 +1846,11 @@ .align 16 bn_mul_comba4: .L_bn_mul_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -1972,6 +2019,11 @@ .align 16 bn_sqr_comba8: .L_bn_sqr_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -2380,6 +2432,11 @@ .align 16 bn_sqr_comba4: .L_bn_sqr_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -2507,4 +2564,21 @@ popl %esi ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/crypt586.S b/sys/crypto/openssl/i386/crypt586.S --- a/sys/crypto/openssl/i386/crypt586.S +++ b/sys/crypto/openssl/i386/crypt586.S @@ -6,6 +6,11 @@ .align 16 fcrypt_body: .L_fcrypt_body_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -878,6 +883,23 @@ popl %ebp ret .size fcrypt_body,.-.L_fcrypt_body_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl fcrypt_body @@ -885,6 +907,11 @@ .align 16 fcrypt_body: .L_fcrypt_body_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1753,4 +1780,21 @@ popl %ebp ret .size fcrypt_body,.-.L_fcrypt_body_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/des-586.S b/sys/crypto/openssl/i386/des-586.S --- a/sys/crypto/openssl/i386/des-586.S +++ b/sys/crypto/openssl/i386/des-586.S @@ -5,6 +5,11 @@ .type _x86_DES_encrypt,@function .align 16 _x86_DES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl (%ecx),%eax @@ -476,6 +481,11 @@ .type _x86_DES_decrypt,@function .align 16 _x86_DES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl 120(%ecx),%eax @@ -949,6 +959,11 @@ .align 16 DES_encrypt1: .L_DES_encrypt1_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -1062,6 +1077,11 @@ .align 16 DES_encrypt2: .L_DES_encrypt2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -1105,6 +1125,11 @@ .align 16 DES_encrypt3: .L_DES_encrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -1226,6 +1251,11 @@ .align 16 DES_decrypt3: .L_DES_decrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -1347,6 +1377,11 @@ .align 16 DES_ncbc_encrypt: .L_DES_ncbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1408,21 +1443,56 @@ xorl %edx,%edx jmp *%ebp .L012ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L013ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L014ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L015ej4: + 
#ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L016ejend .L017ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L018ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L019ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L016ejend: xorl %ecx,%eax @@ -1527,6 +1597,11 @@ .align 16 DES_ede3_cbc_encrypt: .L_DES_ede3_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1592,21 +1667,56 @@ xorl %edx,%edx jmp *%ebp .L036ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L037ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L038ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L039ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L040ejend .L041ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L042ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L043ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L040ejend: xorl %ecx,%eax @@ -1837,12 +1947,34 @@ .long 8519680,131200,537002112,545259520 .long 128,545390592,8519808,0 .long 536870912,545259648,131072,8519808 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl DES_SPtrans .type _x86_DES_encrypt,@function .align 16 _x86_DES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl (%ecx),%eax @@ -2314,6 +2446,11 @@ .type _x86_DES_decrypt,@function .align 16 _x86_DES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl 120(%ecx),%eax @@ -2787,6 +2924,11 @@ .align 16 DES_encrypt1: .L_DES_encrypt1_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -2900,6 +3042,11 @@ .align 16 DES_encrypt2: .L_DES_encrypt2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -2943,6 +3090,11 @@ .align 16 DES_encrypt3: .L_DES_encrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -3064,6 +3216,11 @@ .align 16 DES_decrypt3: .L_DES_decrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -3185,6 +3342,11 @@ .align 16 DES_ncbc_encrypt: .L_DES_ncbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -3246,21 +3408,56 @@ xorl %edx,%edx jmp *%ebp .L012ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L013ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L014ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L015ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L016ejend .L017ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L018ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L019ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L016ejend: xorl %ecx,%eax @@ -3365,6 +3562,11 @@ .align 16 DES_ede3_cbc_encrypt: .L_DES_ede3_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -3430,21 +3632,56 @@ xorl %edx,%edx jmp *%ebp .L036ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh 
shll $8,%edx .L037ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L038ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L039ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L040ejend .L041ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L042ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L043ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L040ejend: xorl %ecx,%eax @@ -3675,4 +3912,21 @@ .long 8519680,131200,537002112,545259520 .long 128,545390592,8519808,0 .long 536870912,545259648,131072,8519808 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/e_padlock-x86.S b/sys/crypto/openssl/i386/e_padlock-x86.S --- a/sys/crypto/openssl/i386/e_padlock-x86.S +++ b/sys/crypto/openssl/i386/e_padlock-x86.S @@ -6,6 +6,11 @@ .align 16 padlock_capability: .L_padlock_capability_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx pushfl popl %eax @@ -66,6 +71,11 @@ .align 16 padlock_key_bswap: .L_padlock_key_bswap_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 240(%edx),%ecx incl %ecx @@ -84,6 +94,11 @@ .align 16 padlock_verify_context: .L_padlock_verify_context_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx leal .Lpadlock_saved_context-.L004verify_pic_point,%eax pushfl @@ -95,6 +110,11 @@ .type _padlock_verify_ctx,@function .align 16 _padlock_verify_ctx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%eax btl $30,4(%esp) jnc .L005verified @@ -111,6 +131,11 @@ .align 16 padlock_reload_key: .L_padlock_reload_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popfl ret @@ -120,6 +145,11 @@ .align 16 padlock_aes_block: .L_padlock_aes_block_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi pushl %ebx @@ -140,6 +170,11 @@ .align 16 padlock_ecb_encrypt: .L_padlock_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -319,6 +354,11 @@ .align 16 padlock_cbc_encrypt: .L_padlock_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -502,6 +542,11 @@ .align 16 padlock_cfb_encrypt: .L_padlock_cfb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -624,6 +669,11 @@ .align 16 padlock_ofb_encrypt: .L_padlock_ofb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -746,6 +796,11 @@ .align 16 padlock_ctr32_encrypt: .L_padlock_ctr32_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -853,6 +908,11 @@ .align 16 padlock_xstore: .L_padlock_xstore_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi movl 8(%esp),%edi movl 12(%esp),%edx @@ -863,6 +923,11 @@ .type _win32_segv_handler,@function .align 16 _win32_segv_handler: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $1,%eax movl 4(%esp),%edx movl 12(%esp),%ecx @@ -878,6 +943,11 @@ .align 16 padlock_sha1_oneshot: .L_padlock_sha1_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ -909,6 +979,11 @@ .align 16 padlock_sha1_blocks: 
.L_padlock_sha1_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -939,6 +1014,11 @@ .align 16 padlock_sha256_oneshot: .L_padlock_sha256_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ -970,6 +1050,11 @@ .align 16 padlock_sha256_blocks: .L_padlock_sha256_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -1000,6 +1085,11 @@ .align 16 padlock_sha512_blocks: .L_padlock_sha512_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -1041,6 +1131,23 @@ .align 4 .Lpadlock_saved_context: .long 0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl padlock_capability @@ -1048,6 +1155,11 @@ .align 16 padlock_capability: .L_padlock_capability_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx pushfl popl %eax @@ -1108,6 +1220,11 @@ .align 16 padlock_key_bswap: .L_padlock_key_bswap_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 240(%edx),%ecx incl %ecx @@ -1126,6 +1243,11 @@ .align 16 padlock_verify_context: .L_padlock_verify_context_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx leal .Lpadlock_saved_context-.L004verify_pic_point,%eax pushfl @@ -1137,6 +1259,11 @@ .type _padlock_verify_ctx,@function .align 16 _padlock_verify_ctx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%eax btl $30,4(%esp) jnc .L005verified @@ -1153,6 +1280,11 @@ .align 16 padlock_reload_key: .L_padlock_reload_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popfl ret @@ -1162,6 +1294,11 @@ .align 16 padlock_aes_block: .L_padlock_aes_block_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi pushl %ebx @@ -1182,6 +1319,11 @@ .align 16 padlock_ecb_encrypt: .L_padlock_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1361,6 +1503,11 @@ .align 16 padlock_cbc_encrypt: .L_padlock_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1544,6 +1691,11 @@ .align 16 padlock_cfb_encrypt: .L_padlock_cfb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1666,6 +1818,11 @@ .align 16 padlock_ofb_encrypt: .L_padlock_ofb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1788,6 +1945,11 @@ .align 16 padlock_ctr32_encrypt: .L_padlock_ctr32_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1895,6 +2057,11 @@ .align 16 padlock_xstore: .L_padlock_xstore_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi movl 8(%esp),%edi movl 12(%esp),%edx @@ -1905,6 +2072,11 @@ .type _win32_segv_handler,@function .align 16 _win32_segv_handler: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $1,%eax movl 4(%esp),%edx movl 12(%esp),%ecx @@ -1920,6 +2092,11 @@ .align 16 padlock_sha1_oneshot: .L_padlock_sha1_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ -1951,6 +2128,11 @@ .align 16 padlock_sha1_blocks: .L_padlock_sha1_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl 
%edi pushl %esi movl 12(%esp),%edi @@ -1981,6 +2163,11 @@ .align 16 padlock_sha256_oneshot: .L_padlock_sha256_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ -2012,6 +2199,11 @@ .align 16 padlock_sha256_blocks: .L_padlock_sha256_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -2042,6 +2234,11 @@ .align 16 padlock_sha512_blocks: .L_padlock_sha512_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -2083,4 +2280,21 @@ .align 4 .Lpadlock_saved_context: .long 0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/ecp_nistz256-x86.S b/sys/crypto/openssl/i386/ecp_nistz256-x86.S --- a/sys/crypto/openssl/i386/ecp_nistz256-x86.S +++ b/sys/crypto/openssl/i386/ecp_nistz256-x86.S @@ -2389,6 +2389,11 @@ .align 16 ecp_nistz256_mul_by_2: .L_ecp_nistz256_mul_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2408,6 +2413,11 @@ .align 16 ecp_nistz256_mul_by_3: .L_ecp_nistz256_mul_by_3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2433,6 +2443,11 @@ .align 16 ecp_nistz256_div_by_2: .L_ecp_nistz256_div_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2449,6 +2464,11 @@ .type _ecp_nistz256_div_by_2,@function .align 16 _ecp_nistz256_div_by_2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ebp xorl %edx,%edx movl 4(%esi),%ebx @@ -2532,6 +2552,11 @@ .align 16 ecp_nistz256_add: .L_ecp_nistz256_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2549,6 +2574,11 @@ .type _ecp_nistz256_add,@function .align 16 _ecp_nistz256_add: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -2626,6 +2656,11 @@ .align 16 ecp_nistz256_sub: .L_ecp_nistz256_sub_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2643,6 +2678,11 @@ .type _ecp_nistz256_sub,@function .align 16 _ecp_nistz256_sub: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -2701,6 +2741,11 @@ .align 16 ecp_nistz256_neg: .L_ecp_nistz256_neg_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2729,6 +2774,11 @@ .type _picup_eax,@function .align 16 _picup_eax: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esp),%eax ret .size _picup_eax,.-_picup_eax @@ -2737,6 +2787,11 @@ .align 16 ecp_nistz256_to_mont: .L_ecp_nistz256_to_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2760,6 +2815,11 @@ .align 16 ecp_nistz256_from_mont: .L_ecp_nistz256_from_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2783,6 +2843,11 @@ .align 16 ecp_nistz256_mul_mont: .L_ecp_nistz256_mul_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2806,6 +2871,11 @@ .align 16 ecp_nistz256_sqr_mont: .L_ecp_nistz256_sqr_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2827,6 +2897,11 @@ .type _ecp_nistz256_mul_mont,@function .align 
16 _ecp_nistz256_mul_mont: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + andl $83886080,%eax cmpl $83886080,%eax jne .L004mul_mont_ialu @@ -3724,6 +3799,11 @@ .align 16 ecp_nistz256_scatter_w5: .L_ecp_nistz256_scatter_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3757,6 +3837,11 @@ .align 16 ecp_nistz256_gather_w5: .L_ecp_nistz256_gather_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3851,6 +3936,11 @@ .align 16 ecp_nistz256_scatter_w7: .L_ecp_nistz256_scatter_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3882,6 +3972,11 @@ .align 16 ecp_nistz256_gather_w7: .L_ecp_nistz256_gather_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4096,6 +4191,11 @@ .align 16 ecp_nistz256_point_double: .L_ecp_nistz256_point_double_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4222,6 +4322,11 @@ .align 16 ecp_nistz256_point_add: .L_ecp_nistz256_point_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4735,6 +4840,11 @@ .align 16 ecp_nistz256_point_add_affine: .L_ecp_nistz256_point_add_affine_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5163,6 +5273,23 @@ ret .size ecp_nistz256_point_add_affine,.-.L_ecp_nistz256_point_add_affine_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl ecp_nistz256_precomputed @@ -7553,6 +7680,11 @@ .align 16 ecp_nistz256_mul_by_2: .L_ecp_nistz256_mul_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7572,6 +7704,11 @@ .align 16 ecp_nistz256_mul_by_3: .L_ecp_nistz256_mul_by_3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7597,6 +7734,11 @@ .align 16 ecp_nistz256_div_by_2: .L_ecp_nistz256_div_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7613,6 +7755,11 @@ .type _ecp_nistz256_div_by_2,@function .align 16 _ecp_nistz256_div_by_2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ebp xorl %edx,%edx movl 4(%esi),%ebx @@ -7696,6 +7843,11 @@ .align 16 ecp_nistz256_add: .L_ecp_nistz256_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7713,6 +7865,11 @@ .type _ecp_nistz256_add,@function .align 16 _ecp_nistz256_add: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -7790,6 +7947,11 @@ .align 16 ecp_nistz256_sub: .L_ecp_nistz256_sub_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7807,6 +7969,11 @@ .type _ecp_nistz256_sub,@function .align 16 _ecp_nistz256_sub: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -7865,6 +8032,11 @@ .align 16 ecp_nistz256_neg: .L_ecp_nistz256_neg_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7893,6 +8065,11 @@ .type _picup_eax,@function .align 16 _picup_eax: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esp),%eax ret .size _picup_eax,.-_picup_eax @@ -7901,6 +8078,11 @@ .align 16 
ecp_nistz256_to_mont: .L_ecp_nistz256_to_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7924,6 +8106,11 @@ .align 16 ecp_nistz256_from_mont: .L_ecp_nistz256_from_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7947,6 +8134,11 @@ .align 16 ecp_nistz256_mul_mont: .L_ecp_nistz256_mul_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7970,6 +8162,11 @@ .align 16 ecp_nistz256_sqr_mont: .L_ecp_nistz256_sqr_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7991,6 +8188,11 @@ .type _ecp_nistz256_mul_mont,@function .align 16 _ecp_nistz256_mul_mont: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + andl $83886080,%eax cmpl $83886080,%eax jne .L004mul_mont_ialu @@ -8888,6 +9090,11 @@ .align 16 ecp_nistz256_scatter_w5: .L_ecp_nistz256_scatter_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -8921,6 +9128,11 @@ .align 16 ecp_nistz256_gather_w5: .L_ecp_nistz256_gather_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9015,6 +9227,11 @@ .align 16 ecp_nistz256_scatter_w7: .L_ecp_nistz256_scatter_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9046,6 +9263,11 @@ .align 16 ecp_nistz256_gather_w7: .L_ecp_nistz256_gather_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9260,6 +9482,11 @@ .align 16 ecp_nistz256_point_double: .L_ecp_nistz256_point_double_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9386,6 +9613,11 @@ .align 16 ecp_nistz256_point_add: .L_ecp_nistz256_point_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9899,6 +10131,11 @@ .align 16 ecp_nistz256_point_add_affine: .L_ecp_nistz256_point_add_affine_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -10327,4 +10564,21 @@ ret .size ecp_nistz256_point_add_affine,.-.L_ecp_nistz256_point_add_affine_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/ghash-x86.S b/sys/crypto/openssl/i386/ghash-x86.S --- a/sys/crypto/openssl/i386/ghash-x86.S +++ b/sys/crypto/openssl/i386/ghash-x86.S @@ -6,6 +6,11 @@ .align 16 gcm_gmult_4bit_x86: .L_gcm_gmult_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -100,6 +105,11 @@ .align 16 gcm_ghash_4bit_x86: .L_gcm_ghash_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -209,6 +219,11 @@ .align 16 gcm_gmult_4bit_mmx: .L_gcm_gmult_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -308,6 +323,11 @@ .align 16 gcm_ghash_4bit_mmx: .L_gcm_ghash_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -912,6 +932,11 @@ .align 16 gcm_init_clmul: .L_gcm_init_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax call .L010pic @@ -981,6 +1006,11 @@ .align 16 gcm_gmult_clmul: .L_gcm_gmult_clmul_begin: + #ifdef __CET__ + +.byte 
243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%edx call .L011pic @@ -1034,6 +1064,11 @@ .align 16 gcm_ghash_clmul: .L_gcm_ghash_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1264,6 +1299,23 @@ .byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl gcm_gmult_4bit_x86 @@ -1271,6 +1323,11 @@ .align 16 gcm_gmult_4bit_x86: .L_gcm_gmult_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1365,6 +1422,11 @@ .align 16 gcm_ghash_4bit_x86: .L_gcm_ghash_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1474,6 +1536,11 @@ .align 16 gcm_gmult_4bit_mmx: .L_gcm_gmult_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1573,6 +1640,11 @@ .align 16 gcm_ghash_4bit_mmx: .L_gcm_ghash_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2177,6 +2249,11 @@ .align 16 gcm_init_clmul: .L_gcm_init_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax call .L010pic @@ -2246,6 +2323,11 @@ .align 16 gcm_gmult_clmul: .L_gcm_gmult_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%edx call .L011pic @@ -2299,6 +2381,11 @@ .align 16 gcm_ghash_clmul: .L_gcm_ghash_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2529,4 +2616,21 @@ .byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/md5-586.S b/sys/crypto/openssl/i386/md5-586.S --- a/sys/crypto/openssl/i386/md5-586.S +++ b/sys/crypto/openssl/i386/md5-586.S @@ -1,11 +1,16 @@ /* Do not modify. This file is auto-generated from md5-586.pl. 
*/ #ifdef PIC .text -.globl md5_block_asm_data_order -.type md5_block_asm_data_order,@function +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order,@function .align 16 -md5_block_asm_data_order: -.L_md5_block_asm_data_order_begin: +ossl_md5_block_asm_data_order: +.L_ossl_md5_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%edi @@ -677,14 +682,36 @@ popl %edi popl %esi ret -.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin +.size ossl_md5_block_asm_data_order,.-.L_ossl_md5_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text -.globl md5_block_asm_data_order -.type md5_block_asm_data_order,@function +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order,@function .align 16 -md5_block_asm_data_order: -.L_md5_block_asm_data_order_begin: +ossl_md5_block_asm_data_order: +.L_ossl_md5_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%edi @@ -1356,5 +1383,22 @@ popl %edi popl %esi ret -.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin +.size ossl_md5_block_asm_data_order,.-.L_ossl_md5_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/poly1305-x86.S b/sys/crypto/openssl/i386/poly1305-x86.S --- a/sys/crypto/openssl/i386/poly1305-x86.S +++ b/sys/crypto/openssl/i386/poly1305-x86.S @@ -7,6 +7,11 @@ .align 16 poly1305_init: .L_poly1305_init_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -68,6 +73,11 @@ .align 16 poly1305_blocks: .L_poly1305_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -236,6 +246,11 @@ .align 16 poly1305_emit: .L_poly1305_emit_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -295,6 +310,11 @@ .type _poly1305_init_sse2,@function .align 16 _poly1305_init_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -497,6 +517,11 @@ .type _poly1305_blocks_sse2,@function .align 16 _poly1305_blocks_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1258,6 +1283,11 @@ .type _poly1305_emit_sse2,@function .align 16 _poly1305_emit_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1351,6 +1381,11 @@ .type _poly1305_init_avx2,@function .align 16 _poly1305_init_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + vmovdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -1522,6 +1557,11 @@ .type _poly1305_blocks_avx2,@function .align 16 _poly1305_blocks_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1910,6 +1950,23 @@ .byte 114,103,62,0 .align 4 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .align 64 @@ -1918,6 +1975,11 @@ .align 16 poly1305_init: 
.L_poly1305_init_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1979,6 +2041,11 @@ .align 16 poly1305_blocks: .L_poly1305_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2147,6 +2214,11 @@ .align 16 poly1305_emit: .L_poly1305_emit_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2206,6 +2278,11 @@ .type _poly1305_init_sse2,@function .align 16 _poly1305_init_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -2408,6 +2485,11 @@ .type _poly1305_blocks_sse2,@function .align 16 _poly1305_blocks_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3169,6 +3251,11 @@ .type _poly1305_emit_sse2,@function .align 16 _poly1305_emit_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3262,6 +3349,11 @@ .type _poly1305_init_avx2,@function .align 16 _poly1305_init_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + vmovdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -3433,6 +3525,11 @@ .type _poly1305_blocks_avx2,@function .align 16 _poly1305_blocks_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3821,4 +3918,21 @@ .byte 114,103,62,0 .align 4 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/rc4-586.S b/sys/crypto/openssl/i386/rc4-586.S --- a/sys/crypto/openssl/i386/rc4-586.S +++ b/sys/crypto/openssl/i386/rc4-586.S @@ -6,6 +6,11 @@ .align 16 RC4: .L_RC4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -272,6 +277,11 @@ .align 16 RC4_set_key: .L_RC4_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -350,6 +360,11 @@ .align 16 RC4_options: .L_RC4_options_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L018pic_point .L018pic_point: popl %eax @@ -380,6 +395,23 @@ .align 64 .size RC4_options,.-.L_RC4_options_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl RC4 @@ -387,6 +419,11 @@ .align 16 RC4: .L_RC4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -650,6 +687,11 @@ .align 16 RC4_set_key: .L_RC4_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -725,6 +767,11 @@ .align 16 RC4_options: .L_RC4_options_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L016pic_point .L016pic_point: popl %eax @@ -752,4 +799,21 @@ .align 64 .size RC4_options,.-.L_RC4_options_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/rc5-586.S b/sys/crypto/openssl/i386/rc5-586.S --- a/sys/crypto/openssl/i386/rc5-586.S +++ b/sys/crypto/openssl/i386/rc5-586.S @@ -6,6 +6,11 @@ .align 16 RC5_32_encrypt: .L_RC5_32_encrypt_begin: + 
#ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -197,6 +202,11 @@ .align 16 RC5_32_decrypt: .L_RC5_32_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -390,6 +400,11 @@ .align 16 RC5_32_cbc_encrypt: .L_RC5_32_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -449,21 +464,56 @@ xorl %edx,%edx jmp *%ebp .L010ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L011ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L012ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L013ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L014ejend .L015ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L016ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L017ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L014ejend: xorl %ecx,%eax @@ -563,6 +613,23 @@ .long .L010ej7-.L008PIC_point .align 64 .size RC5_32_cbc_encrypt,.-.L_RC5_32_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl RC5_32_encrypt @@ -570,6 +637,11 @@ .align 16 RC5_32_encrypt: .L_RC5_32_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -761,6 +833,11 @@ .align 16 RC5_32_decrypt: .L_RC5_32_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -954,6 +1031,11 @@ .align 16 RC5_32_cbc_encrypt: .L_RC5_32_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1013,21 +1095,56 @@ xorl %edx,%edx jmp *%ebp .L010ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L011ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L012ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L013ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L014ejend .L015ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L016ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L017ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L014ejend: xorl %ecx,%eax @@ -1127,4 +1244,21 @@ .long .L010ej7-.L008PIC_point .align 64 .size RC5_32_cbc_encrypt,.-.L_RC5_32_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/rmd-586.S b/sys/crypto/openssl/i386/rmd-586.S --- a/sys/crypto/openssl/i386/rmd-586.S +++ b/sys/crypto/openssl/i386/rmd-586.S @@ -6,6 +6,11 @@ .align 16 ripemd160_block_asm_data_order: .L_ripemd160_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax pushl %esi @@ -1964,6 +1969,23 @@ popl %esi ret .size ripemd160_block_asm_data_order,.-.L_ripemd160_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl ripemd160_block_asm_data_order @@ -1971,6 
+1993,11 @@ .align 16 ripemd160_block_asm_data_order: .L_ripemd160_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax pushl %esi @@ -3929,4 +3956,21 @@ popl %esi ret .size ripemd160_block_asm_data_order,.-.L_ripemd160_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/sha1-586.S b/sys/crypto/openssl/i386/sha1-586.S --- a/sys/crypto/openssl/i386/sha1-586.S +++ b/sys/crypto/openssl/i386/sha1-586.S @@ -6,6 +6,11 @@ .align 16 sha1_block_data_order: .L_sha1_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1400,6 +1405,11 @@ .type _sha1_block_data_order_shaext,@function .align 16 _sha1_block_data_order_shaext: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1570,6 +1580,11 @@ .type _sha1_block_data_order_ssse3,@function .align 16 _sha1_block_data_order_ssse3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2789,6 +2804,11 @@ .type _sha1_block_data_order_avx,@function .align 16 _sha1_block_data_order_avx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3969,6 +3989,23 @@ .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl sha1_block_data_order @@ -3976,6 +4013,11 @@ .align 16 sha1_block_data_order: .L_sha1_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5370,6 +5412,11 @@ .type _sha1_block_data_order_shaext,@function .align 16 _sha1_block_data_order_shaext: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5540,6 +5587,11 @@ .type _sha1_block_data_order_ssse3,@function .align 16 _sha1_block_data_order_ssse3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -6759,6 +6811,11 @@ .type _sha1_block_data_order_avx,@function .align 16 _sha1_block_data_order_avx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7939,4 +7996,21 @@ .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/sha256-586.S b/sys/crypto/openssl/i386/sha256-586.S --- a/sys/crypto/openssl/i386/sha256-586.S +++ b/sys/crypto/openssl/i386/sha256-586.S @@ -6,6 +6,11 @@ .align 16 sha256_block_data_order: .L_sha256_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -6782,6 +6787,23 @@ ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + 
.p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl sha256_block_data_order @@ -6789,6 +6811,11 @@ .align 16 sha256_block_data_order: .L_sha256_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -13565,4 +13592,21 @@ ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/sha512-586.S b/sys/crypto/openssl/i386/sha512-586.S --- a/sys/crypto/openssl/i386/sha512-586.S +++ b/sys/crypto/openssl/i386/sha512-586.S @@ -6,6 +6,11 @@ .align 16 sha512_block_data_order: .L_sha512_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2828,6 +2833,23 @@ .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl sha512_block_data_order @@ -2835,6 +2857,11 @@ .align 16 sha512_block_data_order: .L_sha512_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5657,4 +5684,21 @@ .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/vpaes-x86.S b/sys/crypto/openssl/i386/vpaes-x86.S --- a/sys/crypto/openssl/i386/vpaes-x86.S +++ b/sys/crypto/openssl/i386/vpaes-x86.S @@ -61,6 +61,11 @@ .type _vpaes_preheat,@function .align 16 _vpaes_preheat: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%ebp movdqa -48(%ebp),%xmm7 movdqa -16(%ebp),%xmm6 @@ -69,6 +74,11 @@ .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $16,%ecx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -146,6 +156,11 @@ .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal 608(%ebp),%ebx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -234,6 +249,11 @@ .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%ebp movdqu (%esi),%xmm0 movdqa 320(%ebp),%xmm2 @@ -328,6 +348,11 @@ .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -340,6 +365,11 @@ .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa 8(%esp),%xmm2 pxor %xmm1,%xmm1 .byte 102,15,58,15,202,15 @@ -389,6 +419,11 @@ .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa -16(%ebp),%xmm2 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 @@ -404,6 +439,11 @@ .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: + #ifdef 
__CET__ + +.byte 243,15,30,251 + #endif + movdqa %xmm0,%xmm4 movdqa 128(%ebp),%xmm5 testl %edi,%edi @@ -465,6 +505,11 @@ .align 16 vpaes_set_encrypt_key: .L_vpaes_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -498,6 +543,11 @@ .align 16 vpaes_set_decrypt_key: .L_vpaes_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -536,6 +586,11 @@ .align 16 vpaes_encrypt: .L_vpaes_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -565,6 +620,11 @@ .align 16 vpaes_decrypt: .L_vpaes_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -594,6 +654,11 @@ .align 16 vpaes_cbc_encrypt: .L_vpaes_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -660,6 +725,23 @@ popl %ebp ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .align 64 @@ -722,6 +804,11 @@ .type _vpaes_preheat,@function .align 16 _vpaes_preheat: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%ebp movdqa -48(%ebp),%xmm7 movdqa -16(%ebp),%xmm6 @@ -730,6 +817,11 @@ .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $16,%ecx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -807,6 +899,11 @@ .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal 608(%ebp),%ebx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -895,6 +992,11 @@ .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%ebp movdqu (%esi),%xmm0 movdqa 320(%ebp),%xmm2 @@ -989,6 +1091,11 @@ .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -1001,6 +1108,11 @@ .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa 8(%esp),%xmm2 pxor %xmm1,%xmm1 .byte 102,15,58,15,202,15 @@ -1050,6 +1162,11 @@ .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa -16(%ebp),%xmm2 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 @@ -1065,6 +1182,11 @@ .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa %xmm0,%xmm4 movdqa 128(%ebp),%xmm5 testl %edi,%edi @@ -1126,6 +1248,11 @@ .align 16 vpaes_set_encrypt_key: .L_vpaes_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1159,6 +1286,11 @@ .align 16 vpaes_set_decrypt_key: .L_vpaes_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1197,6 +1329,11 @@ .align 16 vpaes_encrypt: .L_vpaes_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1226,6 +1363,11 @@ .align 16 vpaes_decrypt: .L_vpaes_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1255,6 +1397,11 @@ .align 16 vpaes_cbc_encrypt: 
.L_vpaes_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1321,4 +1468,21 @@ popl %ebp ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/wp-mmx.S b/sys/crypto/openssl/i386/wp-mmx.S --- a/sys/crypto/openssl/i386/wp-mmx.S +++ b/sys/crypto/openssl/i386/wp-mmx.S @@ -6,6 +6,11 @@ .align 16 whirlpool_block_mmx: .L_whirlpool_block_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1106,6 +1111,23 @@ .byte 251,238,124,102,221,23,71,158 .byte 202,45,191,7,173,90,131,51 .size whirlpool_block_mmx,.-.L_whirlpool_block_mmx_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl whirlpool_block_mmx @@ -1113,6 +1135,11 @@ .align 16 whirlpool_block_mmx: .L_whirlpool_block_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2213,4 +2240,21 @@ .byte 251,238,124,102,221,23,71,158 .byte 202,45,191,7,173,90,131,51 .size whirlpool_block_mmx,.-.L_whirlpool_block_mmx_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/x86-gf2m.S b/sys/crypto/openssl/i386/x86-gf2m.S --- a/sys/crypto/openssl/i386/x86-gf2m.S +++ b/sys/crypto/openssl/i386/x86-gf2m.S @@ -4,6 +4,11 @@ .type _mul_1x1_mmx,@function .align 16 _mul_1x1_mmx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -107,6 +112,11 @@ .type _mul_1x1_ialu,@function .align 16 _mul_1x1_ialu: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -241,6 +251,11 @@ .align 16 bn_GF2m_mul_2x2: .L_bn_GF2m_mul_2x2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L000PIC_me_up .L000PIC_me_up: popl %edx @@ -345,11 +360,33 @@ .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .type _mul_1x1_mmx,@function .align 16 _mul_1x1_mmx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -453,6 +490,11 @@ .type _mul_1x1_ialu,@function .align 16 _mul_1x1_ialu: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -587,6 +629,11 @@ .align 16 bn_GF2m_mul_2x2: .L_bn_GF2m_mul_2x2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%edx movl (%edx),%eax movl 4(%edx),%edx @@ -688,4 +735,21 @@ .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git 
a/sys/crypto/openssl/i386/x86-mont.S b/sys/crypto/openssl/i386/x86-mont.S --- a/sys/crypto/openssl/i386/x86-mont.S +++ b/sys/crypto/openssl/i386/x86-mont.S @@ -6,6 +6,11 @@ .align 16 bn_mul_mont: .L_bn_mul_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -475,6 +480,23 @@ .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl bn_mul_mont @@ -482,6 +504,11 @@ .align 16 bn_mul_mont: .L_bn_mul_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -948,4 +975,21 @@ .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/x86cpuid.S b/sys/crypto/openssl/i386/x86cpuid.S --- a/sys/crypto/openssl/i386/x86cpuid.S +++ b/sys/crypto/openssl/i386/x86cpuid.S @@ -6,6 +6,11 @@ .align 16 OPENSSL_ia32_cpuid: .L_OPENSSL_ia32_cpuid_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -150,6 +155,11 @@ .align 16 OPENSSL_rdtsc: .L_OPENSSL_rdtsc_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx call .L009PIC_me_up @@ -167,6 +177,11 @@ .align 16 OPENSSL_instrument_halt: .L_OPENSSL_instrument_halt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L011PIC_me_up .L011PIC_me_up: popl %ecx @@ -199,6 +214,11 @@ .align 16 OPENSSL_far_spin: .L_OPENSSL_far_spin_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popl %eax btl $9,%eax @@ -226,6 +246,11 @@ .align 16 OPENSSL_wipe_cpu: .L_OPENSSL_wipe_cpu_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx call .L015PIC_me_up @@ -257,6 +282,11 @@ .align 16 OPENSSL_atomic_add: .L_OPENSSL_atomic_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx pushl %ebx @@ -276,6 +306,11 @@ .align 16 OPENSSL_cleanse: .L_OPENSSL_cleanse_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax @@ -313,6 +348,11 @@ .align 16 CRYPTO_memcmp: .L_CRYPTO_memcmp_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%esi @@ -342,6 +382,11 @@ .align 16 OPENSSL_instrument_bus: .L_OPENSSL_instrument_bus_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -390,6 +435,11 @@ .align 16 OPENSSL_instrument_bus2: .L_OPENSSL_instrument_bus2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -451,6 +501,11 @@ .align 16 OPENSSL_ia32_rdrand_bytes: .L_OPENSSL_ia32_rdrand_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -494,6 +549,11 @@ .align 16 OPENSSL_ia32_rdseed_bytes: .L_OPENSSL_ia32_rdseed_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -537,6 +597,23 @@ .comm OPENSSL_ia32cap_P,16,4 .section .init call OPENSSL_cpuid_setup + + .section ".note.gnu.property", "a" + .p2align 2 + 
.long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl OPENSSL_ia32_cpuid @@ -544,6 +621,11 @@ .align 16 OPENSSL_ia32_cpuid: .L_OPENSSL_ia32_cpuid_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -688,6 +770,11 @@ .align 16 OPENSSL_rdtsc: .L_OPENSSL_rdtsc_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx leal OPENSSL_ia32cap_P,%ecx @@ -702,6 +789,11 @@ .align 16 OPENSSL_instrument_halt: .L_OPENSSL_instrument_halt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%ecx btl $4,(%ecx) jnc .L010nohalt @@ -731,6 +823,11 @@ .align 16 OPENSSL_far_spin: .L_OPENSSL_far_spin_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popl %eax btl $9,%eax @@ -758,6 +855,11 @@ .align 16 OPENSSL_wipe_cpu: .L_OPENSSL_wipe_cpu_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx leal OPENSSL_ia32cap_P,%ecx @@ -786,6 +888,11 @@ .align 16 OPENSSL_atomic_add: .L_OPENSSL_atomic_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx pushl %ebx @@ -805,6 +912,11 @@ .align 16 OPENSSL_cleanse: .L_OPENSSL_cleanse_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax @@ -842,6 +954,11 @@ .align 16 CRYPTO_memcmp: .L_CRYPTO_memcmp_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%esi @@ -871,6 +988,11 @@ .align 16 OPENSSL_instrument_bus: .L_OPENSSL_instrument_bus_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -916,6 +1038,11 @@ .align 16 OPENSSL_instrument_bus2: .L_OPENSSL_instrument_bus2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -974,6 +1101,11 @@ .align 16 OPENSSL_ia32_rdrand_bytes: .L_OPENSSL_ia32_rdrand_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -1017,6 +1149,11 @@ .align 16 OPENSSL_ia32_rdseed_bytes: .L_OPENSSL_ia32_rdseed_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -1060,4 +1197,21 @@ .comm OPENSSL_ia32cap_P,16,4 .section .init call OPENSSL_cpuid_setup + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/powerpc/bn-ppc.S b/sys/crypto/openssl/powerpc/bn-ppc.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/powerpc/bn-ppc.S @@ -0,0 +1,1855 @@ +/* Do not modify. This file is auto-generated from ppc.pl. 
*/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.globl bn_sqr_comba4 +.type bn_sqr_comba4,@function +.globl bn_sqr_comba8 +.type bn_sqr_comba8,@function +.globl bn_mul_comba4 +.type bn_mul_comba4,@function +.globl bn_mul_comba8 +.type bn_mul_comba8,@function +.globl bn_sub_words +.type bn_sub_words,@function +.globl bn_add_words +.type bn_add_words,@function +.globl bn_div_words +.type bn_div_words,@function +.globl bn_sqr_words +.type bn_sqr_words,@function +.globl bn_mul_words +.type bn_mul_words,@function +.globl bn_mul_add_words +.type bn_mul_add_words,@function + + + +.machine "any" +.text + + + + + + + + +.align 4 +bn_sqr_comba4: + + + + + + + + + + + + + + + + xor 0,0,0 + + + + lwz 5,0(4) + mullw 9,5,5 + mulhwu 10,5,5 + + + + + stw 9,0(3) + + lwz 6,4(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 9,0 + + + addc 10,7,10 + addze 11,8 + addze 9,9 + + stw 10,4(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,8(3) + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,4(4) + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,12(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,16(3) + + lwz 5,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,20(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 9,7,9 + adde 10,8,10 + + stw 9,24(3) + stw 10,28(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba4,.-bn_sqr_comba4 + + + + + + + + +.align 4 +bn_sqr_comba8: + + + + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + lwz 5,0(4) + mullw 9,5,5 + mulhwu 10,5,5 + stw 9,0(3) + + lwz 6,4(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,0 + addze 9,0 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + stw 10,4(3) + + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + stw 11,8(3) + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,4(4) + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + stw 9,12(3) + + mullw 7,6,6 + mulhwu 8,6,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,0(4) + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,16(3) + + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,4(4) + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,8(4) + lwz 6,12(4) + mullw 
7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,20(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,4(4) + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,0(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,24(3) + + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,4(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,8(4) + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,12(4) + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,28(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,8(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,4(4) + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,32(3) + + lwz 5,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,12(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,16(4) + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,36(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,12(4) + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,40(3) + + lwz 5,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,20(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,44(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,48(3) + + + lwz 5,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,52(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + stw 11,56(3) + stw 9, 60(3) + + + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba8,.-bn_sqr_comba8 + + + + + + + + +.align 4 +bn_mul_comba4: + + + + + + + + + + + + xor 0,0,0 + + lwz 6,0(4) + lwz 7,0(5) + mullw 10,6,7 + mulhwu 11,6,7 + stw 10,0(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,0 + 
addze 10,0 + + lwz 6, 4(4) + lwz 7, 0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + stw 11,4(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + lwz 6,4(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + + lwz 6,0(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + stw 12,8(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,0 + + lwz 6,4(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + lwz 6,8(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + lwz 6,12(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + stw 10,12(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,0 + + lwz 6,8(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + + lwz 6,4(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + stw 11,16(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + lwz 6,12(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + stw 12,20(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + + stw 10,24(3) + stw 11,28(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba4,.-bn_mul_comba4 + + + + + + + + +.align 4 +bn_mul_comba8: + + + + + + + + + + + + xor 0,0,0 + + + lwz 6,0(4) + lwz 7,0(5) + mullw 10,6,7 + mulhwu 11,6,7 + stw 10,0(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + addze 12,9 + addze 10,0 + + lwz 6,4(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,4(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,4(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,0(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,8(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,4(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + + lwz 6,8(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,12(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,12(3) + + lwz 6,16(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,12(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,8(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,4(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,0(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,16(3) + + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,4(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,8(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,12(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,16(4) + lwz 7,4(5) + mullw 8,6,7 + 
mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,20(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,20(3) + + lwz 6,24(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,20(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,16(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,12(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,8(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,4(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,0(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,24(3) + + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,4(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,8(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,12(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,16(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,20(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,24(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,28(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,28(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,24(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,20(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,16(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,12(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,8(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,4(4) + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,32(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,12(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,16(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,20(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,24(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,28(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,36(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,24(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,20(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,16(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,12(4) + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 
11,40(3) + + lwz 6,16(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,20(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,24(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,28(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,44(3) + + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,24(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,20(4) + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,48(3) + + lwz 6,24(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,28(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,52(3) + + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + stw 12,56(3) + stw 10,60(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba8,.-bn_mul_comba8 + + + + + + + + +.align 4 +bn_sub_words: + + + + + + + + + + + + + + xor 0,0,0 + + + + subfc. 7,0,6 + + + beq .Lppcasm_sub_adios + addi 4,4,-4 + addi 3,3,-4 + addi 5,5,-4 + mtctr 6 +.Lppcasm_sub_mainloop: + lwzu 7,4(4) + lwzu 8,4(5) + subfe 6,8,7 + + + stwu 6,4(3) + bdnz .Lppcasm_sub_mainloop +.Lppcasm_sub_adios: + subfze 3,0 + andi. 3,3,1 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_sub_words,.-bn_sub_words + + + + + + + + +.align 4 +bn_add_words: + + + + + + + + + + + + + + xor 0,0,0 + + + + addic. 6,6,0 + beq .Lppcasm_add_adios + addi 4,4,-4 + addi 3,3,-4 + addi 5,5,-4 + mtctr 6 +.Lppcasm_add_mainloop: + lwzu 7,4(4) + lwzu 8,4(5) + adde 8,7,8 + stwu 8,4(3) + bdnz .Lppcasm_add_mainloop +.Lppcasm_add_adios: + addze 3,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_add_words,.-bn_add_words + + + + + + + + +.align 4 +bn_div_words: + + + + + + + + + + + + cmplwi 0,5,0 + bne .Lppcasm_div1 + li 3,-1 + blr +.Lppcasm_div1: + xor 0,0,0 + li 8,32 + cntlzw. 7,5 + beq .Lppcasm_div2 + subf 8,7,8 + srw. 9,3,8 + tw 16,9,0 +.Lppcasm_div2: + .long 0x7c032840 + blt .Lppcasm_div3 + subf 3,5,3 +.Lppcasm_div3: + cmpi 0,0,7,0 + beq .Lppcasm_div4 + slw 3,3,7 + srw 8,4,8 + slw 5,5,7 + or 3,3,8 + slw 4,4,7 +.Lppcasm_div4: + srwi 9,5,16 + + + li 6,2 + mtctr 6 +.Lppcasm_divouterloop: + srwi 8,3,16 + srwi 11,4,16 + + .long 0x7c084840 + bne .Lppcasm_div5 + + li 8,-1 + clrlwi 8,8,16 + b .Lppcasm_div6 +.Lppcasm_div5: + divwu 8,3,9 +.Lppcasm_div6: + mullw 12,9,8 + clrlwi 10,5,16 + mullw 6,8,10 + +.Lppcasm_divinnerloop: + subf 10,12,3 + srwi 7,10,16 + addic. 7,7,0 + + + + slwi 7,10,16 + or 7,7,11 + .long 0x7c863840 + bne .Lppcasm_divinnerexit + ble 1,.Lppcasm_divinnerexit + addi 8,8,-1 + subf 12,9,12 + clrlwi 10,5,16 + subf 6,10,6 + b .Lppcasm_divinnerloop +.Lppcasm_divinnerexit: + srwi 10,6,16 + slwi 11,6,16 + .long 0x7c845840 + add 12,12,10 + bge 1,.Lppcasm_div7 + addi 12,12,1 +.Lppcasm_div7: + subf 11,11,4 + .long 0x7c836040 + bge 1,.Lppcasm_div8 + addi 8,8,-1 + add 3,5,3 +.Lppcasm_div8: + subf 12,12,3 + slwi 4,11,16 + + + + insrwi 11,12,16,16 + rotlwi 3,11,16 + bdz .Lppcasm_div9 + slwi 0,8,16 + b .Lppcasm_divouterloop +.Lppcasm_div9: + or 3,8,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_div_words,.-bn_div_words + + + + + + + +.align 4 +bn_sqr_words: + + + + + + + + + + + + + + + addic. 
5,5,0 + beq .Lppcasm_sqr_adios + addi 4,4,-4 + addi 3,3,-4 + mtctr 5 +.Lppcasm_sqr_mainloop: + + lwzu 6,4(4) + mullw 7,6,6 + mulhwu 8,6,6 + stwu 7,4(3) + stwu 8,4(3) + bdnz .Lppcasm_sqr_mainloop +.Lppcasm_sqr_adios: + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_sqr_words,.-bn_sqr_words + + + + + + + + +.align 4 +bn_mul_words: + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_mw_REM + mtctr 7 +.Lppcasm_mw_LOOP: + + lwz 8,0(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + + + + + stw 9,0(3) + + lwz 8,4(4) + mullw 11,6,8 + mulhwu 12,6,8 + adde 11,11,10 + + stw 11,4(3) + + lwz 8,8(4) + mullw 9,6,8 + mulhwu 10,6,8 + adde 9,9,12 + + stw 9,8(3) + + lwz 8,12(4) + mullw 11,6,8 + mulhwu 12,6,8 + adde 11,11,10 + addze 12,12 + + stw 11,12(3) + + addi 3,3,16 + addi 4,4,16 + bdnz .Lppcasm_mw_LOOP + +.Lppcasm_mw_REM: + andi. 5,5,0x3 + beq .Lppcasm_mw_OVER + + lwz 8,0(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + stw 9,0(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + + lwz 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + stw 9,4(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + lwz 8,8(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + stw 9,8(3) + addi 12,10,0 + +.Lppcasm_mw_OVER: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_words,.-bn_mul_words + + + + + + + + +.align 4 +bn_mul_add_words: + + + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_maw_leftover + mtctr 7 +.Lppcasm_maw_mainloop: + + lwz 8,0(4) + lwz 11,0(3) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + addc 9,9,11 + + + + + + + stw 9,0(3) + + + lwz 8,4(4) + lwz 9,4(3) + mullw 11,6,8 + mulhwu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + + stw 11,4(3) + + + lwz 8,8(4) + mullw 9,6,8 + lwz 11,8(3) + mulhwu 10,6,8 + adde 9,9,12 + addze 10,10 + addc 9,9,11 + + stw 9,8(3) + + + lwz 8,12(4) + mullw 11,6,8 + lwz 9,12(3) + mulhwu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + addze 12,12 + stw 11,12(3) + addi 3,3,16 + addi 4,4,16 + bdnz .Lppcasm_maw_mainloop + +.Lppcasm_maw_leftover: + andi. 5,5,0x3 + beq .Lppcasm_maw_adios + addi 3,3,-4 + addi 4,4,-4 + + mtctr 5 + lwzu 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + lwzu 11,4(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + stw 9,0(3) + + bdz .Lppcasm_maw_adios + + lwzu 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + lwzu 11,4(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + stw 9,0(3) + + bdz .Lppcasm_maw_adios + + lwzu 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + lwzu 11,4(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + stw 9,0(3) + +.Lppcasm_maw_adios: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_add_words,.-bn_mul_add_words +.align 4 diff --git a/sys/crypto/openssl/powerpc/poly1305-ppc.S b/sys/crypto/openssl/powerpc/poly1305-ppc.S --- a/sys/crypto/openssl/powerpc/poly1305-ppc.S +++ b/sys/crypto/openssl/powerpc/poly1305-ppc.S @@ -11,6 +11,7 @@ stw 0,8(3) stw 0,12(3) stw 0,16(3) + stw 0,24(3) .long 0x7c040040 beq- .Lno_key @@ -46,6 +47,7 @@ .type poly1305_blocks,@function .align 4 poly1305_blocks: +.Lpoly1305_blocks: srwi. 
5,5,4 beq- .Labort @@ -243,70 +245,1057 @@ .long 0 .byte 0,12,4,1,0x80,18,4,0 .size poly1305_blocks,.-poly1305_blocks - .globl poly1305_emit .type poly1305_emit,@function -.align 4 +.align 5 poly1305_emit: - stwu 1,-96(1) - mflr 0 - stw 28,80(1) - stw 29,84(1) - stw 30,88(1) - stw 31,92(1) - stw 0,100(1) + lwz 0,24(3) + lwz 6,0(3) + lwz 7,4(3) + lwz 8,8(3) + lwz 9,12(3) + lwz 10,16(3) + cmplwi 0,0 + beq .Lemit_base2_32 - lwz 7,0(3) - lwz 8,4(3) - lwz 9,8(3) - lwz 10,12(3) - lwz 11,16(3) + slwi 11,7,26 + srwi 7,7,6 + slwi 12,8,20 + srwi 8,8,12 + addc 6,6,11 + slwi 11,9,14 + srwi 9,9,18 + adde 7,7,12 + slwi 12,10,8 + srwi 10,10,24 + adde 8,8,11 + adde 9,9,12 + addze 10,10 - addic 28,7,5 - addze 29,8 - addze 30,9 - addze 31,10 - addze 0,11 +.Lemit_base2_32: + addic 0,6,5 + addze 0,7 + addze 0,8 + addze 0,9 + addze 0,10 srwi 0,0,2 neg 0,0 + andi. 0,0,5 + + addc 6,6,0 + lwz 0,0(5) + addze 7,7 + lwz 11,4(5) + addze 8,8 + lwz 12,8(5) + addze 9,9 + lwz 10,12(5) + + addc 6,6,0 + adde 7,7,11 + adde 8,8,12 + adde 9,9,10 + + addi 3,4,-1 + addi 4,4,7 + + stbu 6,1(3) + srwi 6,6,8 + stbu 8,1(4) + srwi 8,8,8 + + stbu 6,1(3) + srwi 6,6,8 + stbu 8,1(4) + srwi 8,8,8 + + stbu 6,1(3) + srwi 6,6,8 + stbu 8,1(4) + srwi 8,8,8 + + stbu 6,1(3) + stbu 8,1(4) + + stbu 7,1(3) + srwi 7,7,8 + stbu 9,1(4) + srwi 9,9,8 + + stbu 7,1(3) + srwi 7,7,8 + stbu 9,1(4) + srwi 9,9,8 + + stbu 7,1(3) + srwi 7,7,8 + stbu 9,1(4) + srwi 9,9,8 + + stbu 7,1(3) + stbu 9,1(4) - andc 7,7,0 - and 28,28,0 - andc 8,8,0 - and 29,29,0 - or 7,7,28 - lwz 28,0(5) - andc 9,9,0 - and 30,30,0 - or 8,8,29 - lwz 29,4(5) - andc 10,10,0 - and 31,31,0 - or 9,9,30 - lwz 30,8(5) - or 10,10,31 - lwz 31,12(5) - - addc 7,7,28 - adde 8,8,29 - adde 9,9,30 - adde 10,10,31 - li 29,4 - stwbrx 7,0,4 - li 30,8 - stwbrx 8,29,4 - li 31,12 - stwbrx 9,30,4 - stwbrx 10,31,4 - lwz 28,80(1) - lwz 29,84(1) - lwz 30,88(1) - lwz 31,92(1) - addi 1,1,96 blr .long 0 -.byte 0,12,4,1,0x80,4,3,0 +.byte 0,12,0x14,0,0,0,3,0 .size poly1305_emit,.-poly1305_emit -.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl poly1305_blocks_vsx +.type poly1305_blocks_vsx,@function +.align 5 +poly1305_blocks_vsx: + lwz 7,24(3) + cmplwi 5,128 + bge __poly1305_blocks_vsx + cmplwi 7,0 + beq .Lpoly1305_blocks + + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + + slwi 0,8,26 + srwi 8,8,6 + slwi 12,9,20 + srwi 9,9,12 + addc 7,7,0 + slwi 0,10,14 + srwi 10,10,18 + adde 8,8,12 + slwi 12,11,8 + srwi 11,11,24 + adde 9,9,0 + li 0,0 + adde 10,10,12 + addze 11,11 + + stw 7,0(3) + stw 8,4(3) + stw 9,8(3) + stw 10,12(3) + stw 11,16(3) + stw 0,24(3) + + b .Lpoly1305_blocks +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.size poly1305_blocks_vsx,.-poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + .long 0x11E05088 + .long 0x12015088 + .long 0x12225088 + .long 0x12435088 + .long 0x12645088 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + .long 0x12823888 + .long 0x11EFA0C0 + .long 0x12833888 + .long 0x1210A0C0 + .long 0x12843888 + .long 0x1231A0C0 + .long 0x12803088 + .long 0x1252A0C0 + .long 0x12813088 + .long 0x1273A0C0 + + .long 0x12814888 
+ .long 0x11EFA0C0 + .long 0x12824888 + .long 0x1210A0C0 + .long 0x12834888 + .long 0x1231A0C0 + .long 0x12844888 + .long 0x1252A0C0 + .long 0x12804088 + .long 0x1273A0C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_blocks_vsx: + stwu 1,-384(1) + mflr 0 + li 10,167 + li 11,183 + mfspr 12,256 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,10,1 + addi 10,10,32 + stvx 24,11,1 + addi 11,11,32 + stvx 25,10,1 + addi 10,10,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + stw 12,360(1) + li 12,-1 + mtspr 256,12 + stw 27,364(1) + stw 28,368(1) + stw 29,372(1) + stw 30,376(1) + stw 31,380(1) + stw 0,388(1) + + bl .LPICmeup + + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + .long 0x7FA06699 + .long 0x7F3B6699 + .long 0x7F7C6699 + .long 0x7FFD6699 + .long 0x7FDE6699 + + cmplwi 7,0 + bne .Lskip_init_vsx + + lwz 8,32(3) + lwz 9,36(3) + lwz 10,40(3) + lwz 11,44(3) + + extrwi 7,8,26,6 + extrwi 8,8,6,0 + insrwi 8,9,20,6 + extrwi 9,9,12,0 + insrwi 9,10,14,6 + extrwi 10,10,18,0 + insrwi 10,11,8,6 + extrwi 11,11,24,0 + + .long 0x7D4701E7 + slwi 7,8,2 + .long 0x7D6801E7 + add 8,8,7 + .long 0x7D8801E7 + slwi 8,9,2 + .long 0x7DA901E7 + add 9,9,8 + .long 0x7DC901E7 + slwi 9,10,2 + .long 0x7CCA01E7 + add 10,10,9 + .long 0x7CEA01E7 + slwi 10,11,2 + .long 0x7D0B01E7 + add 11,11,10 + .long 0x7D2B01E7 + + vor 0,10,10 + vor 1,11,11 + vor 2,13,13 + vor 3,6,6 + vor 4,8,8 + + bl __poly1305_mul + + .long 0xF1405057 + .long 0xF1615857 + .long 0xF1A26857 + .long 0xF0C33057 + .long 0xF1044057 + .long 0xF0000057 + .long 0xF0210857 + .long 0xF0421057 + .long 0xF0631857 + .long 0xF0842057 + .long 0x118BA5C4 + .long 0x11CDA5C4 + .long 0x10E6A5C4 + .long 0x1128A5C4 + .long 0x118C58C0 + .long 0x11CE68C0 + .long 0x10E730C0 + .long 0x112940C0 + + bl __poly1305_mul + + addi 7,3,0x60 + lwz 8,0(3) + lwz 9,4(3) + lwz 10,8(3) + lwz 11,12(3) + lwz 0,16(3) + + .long 0x114A068C + .long 0x116B0E8C + .long 0x11AD168C + .long 0x10C61E8C + .long 0x1108268C + vslw 12,11,20 + vslw 14,13,20 + vslw 7,6,20 + vslw 9,8,20 + vadduwm 12,12,11 + vadduwm 14,14,13 + vadduwm 7,7,6 + vadduwm 9,9,8 + + .long 0x7D5D1F99 + .long 0x7D7E1F99 + .long 0x7D9F1F99 + .long 0x7DA03F99 + .long 0x7DDB3F99 + .long 0x7CDC3F99 + .long 0x7CFD3F99 + .long 0x7D1E3F99 + .long 0x7D3F3F99 + + extrwi 7,8,26,6 + extrwi 8,8,6,0 + .long 0x7C0701E7 + insrwi 8,9,20,6 + extrwi 9,9,12,0 + .long 0x7C2801E7 + insrwi 9,10,14,6 + extrwi 10,10,18,0 + .long 0x7C4901E7 + insrwi 10,11,8,6 + extrwi 11,11,24,0 + .long 0x7C6A01E7 + insrwi 11,0,3,5 + .long 0x7C8B01E7 + li 0,1 + stw 0,24(3) + b .Loaded_vsx + +.align 4 +.Lskip_init_vsx: + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001819 + .long 0x7C3B1819 + .long 0x7C5C1819 + .long 0x7C7D1819 + .long 0x7C9E1819 + +.Loaded_vsx: + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + li 7,0x60 + li 8,0x70 + addi 10,3,64 + addi 11,1,39 + + vxor 20,20,20 + .long 
0xF000A057 + .long 0xF021A057 + .long 0xF042A057 + .long 0xF063A057 + .long 0xF084A057 + + .long 0x7F5F6699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + vspltisb 28,14 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x1295CEC4 + .long 0x12D6D6C4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + vand 21,21,29 + vand 20,20,29 + vand 22,22,29 + vand 23,23,29 + + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10D4368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x7D5D1A99 + .long 0x7D605299 + .long 0x7D9B5299 + .long 0x7DBC5299 + .long 0x7DDD5299 + .long 0x7EBE5299 + .long 0x7EDF5299 + .long 0x7EE75299 + .long 0x7F085299 + stvx 11,0,11 + stvx 12,27,11 + stvx 13,28,11 + stvx 14,29,11 + stvx 21,30,11 + stvx 22,31,11 + stvx 23,7,11 + stvx 24,8,11 + + addi 4,4,0x40 + addi 12,12,0x50 + addi 0,5,-64 + srdi 0,0,6 + mtctr 0 + b .Loop_vsx + +.align 4 +.Loop_vsx: + + + + + + + + + + + + + + + .long 0x11E55288 + .long 0x12055A88 + .long 0x12256A88 + .long 0x12466A88 + + .long 0x12865288 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12676A88 + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12875A88 + .long 0x1252A0C0 + lvx 12,31,11 + .long 0x12885A88 + .long 0x1273A0C0 + lvx 11,30,11 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12875288 + .long 0x1231A0C0 + .long 0x12885288 + .long 0x1252A0C0 + lvx 14,8,11 + .long 0x12895288 + .long 0x1273A0C0 + lvx 13,7,11 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + lvx 12,27,11 + .long 0x12815888 + .long 0x1273A0C0 + lvx 11,0,11 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + lvx 14,29,11 + .long 0x12806888 + .long 0x1273A0C0 + lvx 13,28,11 + + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 
0x12D6D6C4 + .long 0x1355CEC4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + vand 21,21,29 + vand 26,26,29 + vand 22,22,29 + vand 23,23,29 + + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10DA368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + addi 4,4,0x40 + bdnz .Loop_vsx + + neg 5,5 + andi. 5,5,0x30 + sub 4,4,5 + + .long 0x7D5D1E99 + .long 0x7D605699 + .long 0x7D9B5699 + .long 0x7DBC5699 + .long 0x7DDD5699 + +.Last_vsx: + .long 0x11E55288 + .long 0x12065288 + .long 0x12275288 + .long 0x12485288 + .long 0x12695288 + + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12855A88 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12875A88 + .long 0x1252A0C0 + .long 0x7D9F5699 + .long 0x12885A88 + .long 0x1273A0C0 + .long 0x7D7E5699 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12856A88 + .long 0x1231A0C0 + .long 0x12866A88 + .long 0x1252A0C0 + .long 0x7DC85699 + .long 0x12876A88 + .long 0x1273A0C0 + .long 0x7DA75699 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + .long 0x7D9B5699 + .long 0x12815888 + .long 0x1273A0C0 + .long 0x7D605699 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + .long 0x7DDD5699 + .long 0x12806888 + .long 0x1273A0C0 + .long 0x7DBC5699 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + + + + .long 0xF00F7A57 + .long 0xF0308257 + .long 0xF0518A57 + .long 0xF0729257 + .long 0xF0939A57 + .long 0x11EF00C0 + .long 0x121008C0 + .long 0x123110C0 + .long 0x125218C0 + .long 0x127320C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + 
.long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + beq .Ldone_vsx + + add 6,12,5 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF297C057 + vperm 21,23,24,31 + .long 0xF2D7C357 + + .long 0x7DE03699 + .long 0x7E1D3699 + + .long 0x12F4CEC4 + .long 0x12B5D6C4 + .long 0x1316DEC4 + .long 0x12D6E6C4 + vand 20,20,29 + vand 23,23,29 + vand 21,21,29 + vand 22,22,29 + + + .long 0x11384E8C + .long 0x10B42E8C + .long 0x10D7368C + .long 0x10F53E8C + .long 0x1116468C + vor 9,9,30 + + vperm 0,0,0,15 + vand 5,5, 16 + vperm 1,1,1,15 + vand 6,6, 16 + vperm 2,2,2,15 + vand 7,7, 16 + vperm 3,3,3,15 + vand 8,8, 16 + vperm 4,4,4,15 + vand 9,9, 16 + + .long 0x10A500C0 + vxor 0,0,0 + .long 0x10C608C0 + vxor 1,1,1 + .long 0x10E710C0 + vxor 2,2,2 + .long 0x110818C0 + vxor 3,3,3 + .long 0x112920C0 + vxor 4,4,4 + + xor. 5,5,5 + b .Last_vsx + +.align 4 +.Ldone_vsx: + lwz 0,388(1) + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001919 + .long 0x7C3B1919 + .long 0x7C5C1919 + .long 0x7C7D1919 + .long 0x7C9E1919 + + lwz 12,360(1) + mtlr 0 + li 10,167 + li 11,183 + mtspr 256,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,10,1 + addi 10,10,32 + lvx 22,11,1 + addi 11,11,32 + lvx 23,10,1 + addi 10,10,32 + lvx 24,11,1 + addi 11,11,32 + lvx 25,10,1 + addi 10,10,32 + lvx 26,11,1 + addi 11,11,32 + lvx 27,10,1 + addi 10,10,32 + lvx 28,11,1 + addi 11,11,32 + lvx 29,10,1 + addi 10,10,32 + lvx 30,11,1 + lvx 31,10,1 + lwz 27,364(1) + lwz 28,368(1) + lwz 29,372(1) + lwz 30,376(1) + lwz 31,380(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,0x04,1,0x80,5,4,0 +.long 0 +.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx + +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 12 + addi 12,12,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 + +.long 0x00000000,0x03ffffff +.long 0x00000000,0x03ffffff +.long 0x00000000,0x0000001a +.long 0x00000000,0x0000001a +.long 0x00000000,0x00000028 +.long 0x00000000,0x00000028 +.long 0x00000000,0x0e0f0001 +.long 0x00000000,0x1e1f1011 +.long 0x01000000,0x01000000 +.long 0x01000000,0x01000000 +.long 0x07060504,0x03020100 +.long 0x0f0e0d0c,0x0b0a0908 + +.long 0x00000000,0x00000000 +.long 0x00000000,0x04050607 +.long 0x04050607,0x00000000 +.long 0x00000000,0x00000000 +.long 0x00000000,0x00000000 +.long 0x04050607,0x00000000 + +.long 0xffffffff,0x00000000 +.long 0xffffffff,0xffffffff +.long 0xffffffff,0x00000000 +.long 0xffffffff,0x00000000 +.long 0x00000000,0x00000000 +.long 0xffffffff,0x00000000 +.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 diff --git a/sys/crypto/openssl/powerpc/vpaes-ppc.S b/sys/crypto/openssl/powerpc/vpaes-ppc.S --- a/sys/crypto/openssl/powerpc/vpaes-ppc.S +++ b/sys/crypto/openssl/powerpc/vpaes-ppc.S @@ -667,7 +667,7 @@ vor 24,0,0 sub. 30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -719,7 +719,7 @@ vor 24,25,25 sub. 
30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -1037,7 +1037,7 @@ vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1059,7 +1059,7 @@ addi 9, 5, -15 vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1150,7 +1150,7 @@ vsldoi 1, 9, 7, 12 vxor 7, 7, 1 - vspltisb 1, 0x0f + vspltisb 1,0x0f vsldoi 4, 9, 7, 8 @@ -1246,7 +1246,7 @@ vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr @@ -1297,7 +1297,7 @@ vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr diff --git a/sys/crypto/openssl/powerpc64/bn-ppc.S b/sys/crypto/openssl/powerpc64/bn-ppc.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/powerpc64/bn-ppc.S @@ -0,0 +1,1876 @@ +/* Do not modify. This file is auto-generated from ppc.pl. */ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.globl bn_sqr_comba4 +.type bn_sqr_comba4,@function +.globl bn_sqr_comba8 +.type bn_sqr_comba8,@function +.globl bn_mul_comba4 +.type bn_mul_comba4,@function +.globl bn_mul_comba8 +.type bn_mul_comba8,@function +.globl bn_sub_words +.type bn_sub_words,@function +.globl bn_add_words +.type bn_add_words,@function +.globl bn_div_words +.type bn_div_words,@function +.globl bn_sqr_words +.type bn_sqr_words,@function +.globl bn_mul_words +.type bn_mul_words,@function +.globl bn_mul_add_words +.type bn_mul_add_words,@function + + + +.machine "any" +.abiversion 2 +.text + + + + + + + + +.align 4 +bn_sqr_comba4: +.localentry bn_sqr_comba4,0 + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + + + + + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 9,0 + + + addc 10,7,10 + addze 11,8 + addze 9,9 + + std 10,8(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + + std 9,48(3) + std 10,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba4,.-bn_sqr_comba4 + + + + + + + + +.align 4 +bn_sqr_comba8: +.localentry bn_sqr_comba8,0 + + + + + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,0 + addze 9,0 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + std 10,8(3) + + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 
+ + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,0(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,0(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,48(3) + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,8(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,16(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,56(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,64(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,24(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,32(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,72(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,80(3) + + ld 5,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + addc 
11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,40(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,88(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,96(3) + + + ld 5,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,104(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + std 11,112(3) + std 9, 120(3) + + + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba8,.-bn_sqr_comba8 + + + + + + + + +.align 4 +bn_mul_comba4: +.localentry bn_mul_comba4,0 + + + + + + + + + + + + + xor 0,0,0 + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,0 + addze 10,0 + + ld 6, 8(4) + ld 7, 0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,16(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + std 10,24(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,0 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,32(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,40(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + + std 10,48(3) + std 11,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba4,.-bn_mul_comba4 + + + + + + + + +.align 4 +bn_mul_comba8: +.localentry bn_mul_comba8,0 + + + + + + + + + + + + + xor 0,0,0 + + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + addze 12,9 + addze 10,0 + + ld 6,8(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + + ld 6,16(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + 
adde 11,11,9 + addze 12,12 + std 10,24(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,24(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,0(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,32(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,40(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,40(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,32(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,16(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,8(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,0(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,48(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,8(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,48(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,56(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,56(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,48(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,8(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,64(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,24(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 
+ + ld 6,32(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,48(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,56(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,72(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,48(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,80(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,40(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,48(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,56(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,88(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,48(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,96(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,56(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,104(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + std 12,112(3) + std 10,120(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba8,.-bn_mul_comba8 + + + + + + + + +.align 4 +bn_sub_words: +.localentry bn_sub_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + subfc. 7,0,6 + + + beq .Lppcasm_sub_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_sub_mainloop: + ldu 7,8(4) + ldu 8,8(5) + subfe 6,8,7 + + + stdu 6,8(3) + bdnz .Lppcasm_sub_mainloop +.Lppcasm_sub_adios: + subfze 3,0 + andi. 3,3,1 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_sub_words,.-bn_sub_words + + + + + + + + +.align 4 +bn_add_words: +.localentry bn_add_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + addic. 6,6,0 + beq .Lppcasm_add_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_add_mainloop: + ldu 7,8(4) + ldu 8,8(5) + adde 8,7,8 + stdu 8,8(3) + bdnz .Lppcasm_add_mainloop +.Lppcasm_add_adios: + addze 3,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_add_words,.-bn_add_words + + + + + + + + +.align 4 +bn_div_words: +.localentry bn_div_words,0 + + + + + + + + + + + + + cmpldi 0,5,0 + bne .Lppcasm_div1 + li 3,-1 + blr +.Lppcasm_div1: + xor 0,0,0 + li 8,64 + cntlzd. 7,5 + beq .Lppcasm_div2 + subf 8,7,8 + srd. 
9,3,8 + td 16,9,0 +.Lppcasm_div2: + cmpld 0,3,5 + blt .Lppcasm_div3 + subf 3,5,3 +.Lppcasm_div3: + cmpi 0,0,7,0 + beq .Lppcasm_div4 + sld 3,3,7 + srd 8,4,8 + sld 5,5,7 + or 3,3,8 + sld 4,4,7 +.Lppcasm_div4: + srdi 9,5,32 + + + li 6,2 + mtctr 6 +.Lppcasm_divouterloop: + srdi 8,3,32 + srdi 11,4,32 + + cmpld 0,8,9 + bne .Lppcasm_div5 + + li 8,-1 + clrldi 8,8,32 + b .Lppcasm_div6 +.Lppcasm_div5: + divdu 8,3,9 +.Lppcasm_div6: + mulld 12,9,8 + clrldi 10,5,32 + mulld 6,8,10 + +.Lppcasm_divinnerloop: + subf 10,12,3 + srdi 7,10,32 + addic. 7,7,0 + + + + sldi 7,10,32 + or 7,7,11 + cmpld 1,6,7 + bne .Lppcasm_divinnerexit + ble 1,.Lppcasm_divinnerexit + addi 8,8,-1 + subf 12,9,12 + clrldi 10,5,32 + subf 6,10,6 + b .Lppcasm_divinnerloop +.Lppcasm_divinnerexit: + srdi 10,6,32 + sldi 11,6,32 + cmpld 1,4,11 + add 12,12,10 + bge 1,.Lppcasm_div7 + addi 12,12,1 +.Lppcasm_div7: + subf 11,11,4 + cmpld 1,3,12 + bge 1,.Lppcasm_div8 + addi 8,8,-1 + add 3,5,3 +.Lppcasm_div8: + subf 12,12,3 + sldi 4,11,32 + + + + insrdi 11,12,32,32 + rotldi 3,11,32 + bdz .Lppcasm_div9 + sldi 0,8,32 + b .Lppcasm_divouterloop +.Lppcasm_div9: + or 3,8,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_div_words,.-bn_div_words + + + + + + + +.align 4 +bn_sqr_words: +.localentry bn_sqr_words,0 + + + + + + + + + + + + + + + + addic. 5,5,0 + beq .Lppcasm_sqr_adios + addi 4,4,-8 + addi 3,3,-8 + mtctr 5 +.Lppcasm_sqr_mainloop: + + ldu 6,8(4) + mulld 7,6,6 + mulhdu 8,6,6 + stdu 7,8(3) + stdu 8,8(3) + bdnz .Lppcasm_sqr_mainloop +.Lppcasm_sqr_adios: + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_sqr_words,.-bn_sqr_words + + + + + + + + +.align 4 +bn_mul_words: +.localentry bn_mul_words,0 + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_mw_REM + mtctr 7 +.Lppcasm_mw_LOOP: + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + + + + + std 9,0(3) + + ld 8,8(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + + std 11,8(3) + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + adde 9,9,12 + + std 9,16(3) + + ld 8,24(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + + std 11,24(3) + + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_mw_LOOP + +.Lppcasm_mw_REM: + andi. 5,5,0x3 + beq .Lppcasm_mw_OVER + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,0(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + + ld 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,8(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,16(3) + addi 12,10,0 + +.Lppcasm_mw_OVER: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_words,.-bn_mul_words + + + + + + + + +.align 4 +bn_mul_add_words: +.localentry bn_mul_add_words,0 + + + + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 
7,5,30,2,31 + beq .Lppcasm_maw_leftover + mtctr 7 +.Lppcasm_maw_mainloop: + + ld 8,0(4) + ld 11,0(3) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + addc 9,9,11 + + + + + + + std 9,0(3) + + + ld 8,8(4) + ld 9,8(3) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + + std 11,8(3) + + + ld 8,16(4) + mulld 9,6,8 + ld 11,16(3) + mulhdu 10,6,8 + adde 9,9,12 + addze 10,10 + addc 9,9,11 + + std 9,16(3) + + + ld 8,24(4) + mulld 11,6,8 + ld 9,24(3) + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + addze 12,12 + std 11,24(3) + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_maw_mainloop + +.Lppcasm_maw_leftover: + andi. 5,5,0x3 + beq .Lppcasm_maw_adios + addi 3,3,-8 + addi 4,4,-8 + + mtctr 5 + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + +.Lppcasm_maw_adios: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_add_words,.-bn_mul_add_words +.align 4 diff --git a/sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S b/sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S @@ -0,0 +1,354 @@ +/* Do not modify. This file is auto-generated from ecp_nistp521-ppc64.pl. */ +.machine "any" +.abiversion 2 +.text + +.globl p521_felem_mul +.type p521_felem_mul,@function +.align 5 +p521_felem_mul: +.localentry p521_felem_mul,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + lxsd 3,0(5) + lxsd 4,8(5) + lxsd 5,16(5) + lxsd 6,24(5) + lxsd 7,32(5) + lxsd 8,40(5) + lxsd 9,48(5) + lxsd 10,56(5) + lxsd 11,64(5) + + .long 0x12ED1823 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13011023 + + xxpermdi 34,37,36,0b00 + .long 0x13211023 + .long 0x132F1E63 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,36,35,0b00 + .long 0x134CB6A3 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + xxpermdi 54,37,36,0b00 + .long 0x136CB6E3 + .long 0x13711EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + xxpermdi 54,38,37,0b00 + .long 0x138CB723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 54,39,38,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + + xxpermdi 33,49,50,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13811723 + + xxpermdi 34,37,36,0b00 + .long 0x13A11763 + .long 0x13B31F63 + + xxpermdi 34,38,37,0b00 + .long 0x13C117A3 + xxpermdi 44,51,52,0b00 + xxpermdi 54,36,35,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,39,38,0b00 + .long 0x13E117E3 + xxpermdi 54,37,36,0b00 + .long 0x13ECB7E3 + .long 0x13F51FE3 + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x10630DC4 + .long 0x10840DC4 + .long 0x10A50DC4 + .long 
0x10C60DC4 + .long 0x10E70DC4 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13D55FA3 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13A11763 + + xxpermdi 33,51,52,0b00 + .long 0x13811723 + .long 0x13954F23 + + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + xxpermdi 54,41,40,0b00 + xxpermdi 44,52,53,0b00 + .long 0x136CB6E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + xxpermdi 44,51,52,0b00 + .long 0x134CB6A3 + .long 0x13553EA3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + xxpermdi 44,50,51,0b00 + .long 0x132CB663 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + xxpermdi 44,49,50,0b00 + .long 0x130CB623 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 44,48,49,0b00 + .long 0x12ECB5E3 + + xxpermdi 34,39,38,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13211663 + + xxpermdi 33,51,52,0b00 + .long 0x13011623 + .long 0x13152E23 + + xxpermdi 33,50,51,0b00 + .long 0x12E115E3 + xxpermdi 54,37,36,0b00 + xxpermdi 44,52,53,0b00 + .long 0x12ECB5E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + stxv 61,96(3) + stxv 62,112(3) + stxv 63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_mul,.-p521_felem_mul + +.globl p521_felem_square +.type p521_felem_square,@function +.align 5 +p521_felem_square: +.localentry p521_felem_square,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x106D0DC4 + .long 0x108E0DC4 + .long 0x10AF0DC4 + .long 0x10D00DC4 + .long 0x10F10DC4 + .long 0x11120DC4 + .long 0x11330DC4 + .long 0x11540DC4 + .long 0x11750DC4 + .long 0x12ED6823 + + .long 0x130D2023 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,37,46,0b00 + .long 0x13211023 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + .long 0x136F7EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + .long 0x138F3723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,39,48,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + .long 0x13F18FE3 + + .long 0x13124623 + + .long 0x13534EA3 + + .long 0x13945723 + + .long 0x13D55FA3 + + mtvsrdd 33,9,8 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13B45F63 + + .long 0x13935F23 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + .long 0x13324E63 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + .long 0x13114E23 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 34,41,40,0b00 + xxpermdi 33,48,49,0b00 + .long 0x12E115E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + stxv 61,96(3) + stxv 62,112(3) + stxv 
63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_square,.-p521_felem_square + diff --git a/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S b/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S --- a/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S +++ b/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S @@ -304,19 +304,19 @@ dword_le_load: .localentry dword_le_load,0 - lbzu 0,1(3) - lbzu 4,1(3) - lbzu 5,1(3) + lbz 0,1(3) + lbz 4,2(3) + lbz 5,3(3) insrdi 0,4,8,48 - lbzu 4,1(3) + lbz 4,4(3) insrdi 0,5,8,40 - lbzu 5,1(3) + lbz 5,5(3) insrdi 0,4,8,32 - lbzu 4,1(3) + lbz 4,6(3) insrdi 0,5,8,24 - lbzu 5,1(3) + lbz 5,7(3) insrdi 0,4,8,16 - lbzu 4,1(3) + lbzu 4,8(3) insrdi 0,5,8,8 insrdi 0,4,8,0 blr @@ -579,21 +579,21 @@ cmpldi 30,8 blt .Lsqueeze_tail - stbu 0,1(29) + stb 0,1(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,2(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,3(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,4(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,5(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,6(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,7(29) srdi 0,0,8 - stbu 0,1(29) + stbu 0,8(29) subic. 30,30,8 beq .Lsqueeze_done diff --git a/sys/crypto/openssl/powerpc64/poly1305-ppc.S b/sys/crypto/openssl/powerpc64/poly1305-ppc.S --- a/sys/crypto/openssl/powerpc64/poly1305-ppc.S +++ b/sys/crypto/openssl/powerpc64/poly1305-ppc.S @@ -12,6 +12,7 @@ std 0,0(3) std 0,8(3) std 0,16(3) + stw 0,24(3) cmpld 4,0 beq- .Lno_key @@ -48,6 +49,7 @@ poly1305_blocks: .localentry poly1305_blocks,0 +.Lpoly1305_blocks: srdi. 5,5,4 beq- .Labort @@ -138,48 +140,1003 @@ .long 0 .byte 0,12,4,1,0x80,5,4,0 .size poly1305_blocks,.-poly1305_blocks - .globl poly1305_emit .type poly1305_emit,@function -.align 4 +.align 5 poly1305_emit: .localentry poly1305_emit,0 - ld 7,0(3) - ld 8,8(3) - ld 9,16(3) - ld 6,0(5) - ld 5,8(5) + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + lwz 0,24(3) + + sldi 8,8,26 + sldi 12,9,52 + srdi 9,9,12 + sldi 10,10,14 + add 7,7,8 + addc 7,7,12 + sldi 12,11,40 + srdi 11,11,24 + adde 8,9,10 + addc 8,8,12 + addze 9,11 + + ld 10,0(3) + ld 11,8(3) + ld 12,16(3) + + neg 0,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 + and 7,7,0 + and 8,8,0 + and 9,9,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 addic 10,7,5 addze 11,8 addze 12,9 - srdi 0,12,2 - neg 0,0 + srdi 12,12,2 + neg 12,12 - andc 7,7,0 - and 10,10,0 - andc 8,8,0 - and 11,11,0 + andc 7,7,12 + and 10,10,12 + andc 8,8,12 + and 11,11,12 or 7,7,10 or 8,8,11 - rotldi 6,6,32 - rotldi 5,5,32 - addc 7,7,6 - adde 8,8,5 - rldicl 0,7,32,32 - li 10,4 - stwbrx 7,0,4 - rldicl 7,8,32,32 - li 11,8 - stwbrx 0,10,4 - li 12,12 - stwbrx 8,11,4 - stwbrx 7,12,4 + + lwz 12,4(5) + lwz 9,12(5) + lwz 10,0(5) + lwz 11,8(5) + + insrdi 10,12,32,0 + insrdi 11,9,32,0 + + addc 7,7,10 + adde 8,8,11 + + addi 3,4,-1 + addi 4,4,7 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + stbu 8,1(4) + blr .long 0 .byte 0,12,0x14,0,0,0,3,0 .size poly1305_emit,.-poly1305_emit -.byte 
80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl poly1305_blocks_vsx +.type poly1305_blocks_vsx,@function +.align 5 +poly1305_blocks_vsx: +.localentry poly1305_blocks_vsx,0 + + lwz 7,24(3) + cmpldi 5,128 + bge __poly1305_blocks_vsx + + neg 0,7 + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + + sldi 8,8,26 + sldi 12,9,52 + add 7,7,8 + srdi 9,9,12 + sldi 10,10,14 + addc 7,7,12 + sldi 8,11,40 + adde 9,9,10 + srdi 11,11,24 + addc 9,9,8 + addze 11,11 + + ld 8,0(3) + ld 10,8(3) + ld 12,16(3) + + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + and 7,7,0 + and 9,9,0 + and 11,11,0 + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + + li 0,0 + std 7,0(3) + std 9,8(3) + std 11,16(3) + stw 0,24(3) + + b .Lpoly1305_blocks +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.size poly1305_blocks_vsx,.-poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + mulld 9,6,27 + mulhdu 10,6,27 + + mulld 30,7,29 + mulhdu 31,7,29 + addc 9,9,30 + adde 10,10,31 + + mulld 30,6,28 + mulhdu 11,6,28 + addc 10,10,30 + addze 11,11 + + mulld 30,7,27 + mulhdu 31,7,27 + addc 10,10,30 + adde 11,11,31 + + mulld 30,8,29 + mulld 31,8,27 + addc 10,10,30 + adde 11,11,31 + + andc 30,11,0 + and 8,11,0 + srdi 31,30,2 + add 30,30,31 + addc 6,9,30 + addze 7,10 + addze 8,8 + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_splat: + rldicl 9,6,0,38 + rldicl 10,6,38,38 + stw 9,0x00(31) + + rldicl 11,6,12,52 + slwi 9,10,2 + stw 10,0x10(31) + add 9,9,10 + stw 9,0x20(31) + + insrdi 11,7,14,38 + slwi 9,11,2 + stw 11,0x30(31) + add 9,9,11 + stw 9,0x40(31) + + rldicl 10,7,50,38 + rldicl 11,7,24,40 + slwi 9,10,2 + stw 10,0x50(31) + add 9,9,10 + stw 9,0x60(31) + + insrdi 11,8,3,37 + slwi 9,11,2 + stw 11,0x70(31) + add 9,9,11 + stw 9,0x80(31) + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_splat,.-__poly1305_splat + +.align 5 +__poly1305_blocks_vsx: + stdu 1,-432(1) + mflr 0 + li 10,191 + li 11,207 + li 12,-1 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,10,1 + addi 10,10,32 + stvx 24,11,1 + addi 11,11,32 + stvx 25,10,1 + addi 10,10,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + stw 12,388(1) + li 12,-1 + or 12,12,12 + std 27,392(1) + std 28,400(1) + std 29,408(1) + std 30,416(1) + std 31,424(1) + std 0,448(1) + + bl .LPICmeup + + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + .long 0x7FA06699 + .long 0x7F3B6699 + .long 0x7F7C6699 + .long 0x7FFD6699 + .long 0x7FDE6699 + + cmplwi 7,0 + bne .Lskip_init_vsx + + ld 27,32(3) + ld 28,40(3) + srdi 29,28,2 + li 0,3 + add 29,29,28 + + mr 6,27 + mr 7,28 + li 8,0 + addi 31,3,56 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,48 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,60 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,52 + bl __poly1305_splat + + ld 6,0(3) + ld 7,8(3) + ld 8,16(3) + + rldicl 9,6,0,38 + rldicl 10,6,38,38 + rldicl 11,6,12,52 + .long 0x7C0901E7 + insrdi 11,7,14,38 + .long 0x7C2A01E7 + rldicl 10,7,50,38 + .long 0x7C4B01E7 + rldicl 11,7,24,40 + .long 0x7C6A01E7 + insrdi 11,8,3,37 + .long 0x7C8B01E7 + li 0,1 + stw 0,24(3) + b .Loaded_vsx + +.align 4 +.Lskip_init_vsx: + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001819 + .long 0x7C3B1819 + .long 0x7C5C1819 + .long 0x7C7D1819 + .long 
0x7C9E1819 + +.Loaded_vsx: + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + li 7,0x60 + li 8,0x70 + addi 10,3,64 + addi 11,1,63 + + vxor 20,20,20 + .long 0xF000A057 + .long 0xF021A057 + .long 0xF042A057 + .long 0xF063A057 + .long 0xF084A057 + + .long 0x7F5F6699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + vspltisb 28,14 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x1295CEC4 + .long 0x12D6D6C4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + vand 21,21,29 + vand 20,20,29 + vand 22,22,29 + vand 23,23,29 + + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10D4368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x7D5D1A99 + .long 0x7D605299 + .long 0x7D9B5299 + .long 0x7DBC5299 + .long 0x7DDD5299 + .long 0x7EBE5299 + .long 0x7EDF5299 + .long 0x7EE75299 + .long 0x7F085299 + stvx 11,0,11 + stvx 12,27,11 + stvx 13,28,11 + stvx 14,29,11 + stvx 21,30,11 + stvx 22,31,11 + stvx 23,7,11 + stvx 24,8,11 + + addi 4,4,0x40 + addi 12,12,0x50 + addi 0,5,-64 + srdi 0,0,6 + mtctr 0 + b .Loop_vsx + +.align 4 +.Loop_vsx: + + + + + + + + + + + + + + + .long 0x11E55288 + .long 0x12055A88 + .long 0x12256A88 + .long 0x12466A88 + + .long 0x12865288 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12676A88 + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12875A88 + .long 0x1252A0C0 + lvx 12,31,11 + .long 0x12885A88 + .long 0x1273A0C0 + lvx 11,30,11 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12875288 + .long 0x1231A0C0 + .long 0x12885288 + .long 0x1252A0C0 + lvx 14,8,11 + .long 0x12895288 + .long 0x1273A0C0 + lvx 13,7,11 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + lvx 12,27,11 + .long 0x12815888 + .long 0x1273A0C0 + lvx 11,0,11 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + lvx 14,29,11 + .long 0x12806888 + .long 0x1273A0C0 + lvx 13,28,11 + + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0x12846088 + .long 
0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12D6D6C4 + .long 0x1355CEC4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + vand 21,21,29 + vand 26,26,29 + vand 22,22,29 + vand 23,23,29 + + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10DA368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + addi 4,4,0x40 + bdnz .Loop_vsx + + neg 5,5 + andi. 5,5,0x30 + sub 4,4,5 + + .long 0x7D5D1E99 + .long 0x7D605699 + .long 0x7D9B5699 + .long 0x7DBC5699 + .long 0x7DDD5699 + +.Last_vsx: + .long 0x11E55288 + .long 0x12065288 + .long 0x12275288 + .long 0x12485288 + .long 0x12695288 + + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12855A88 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12875A88 + .long 0x1252A0C0 + .long 0x7D9F5699 + .long 0x12885A88 + .long 0x1273A0C0 + .long 0x7D7E5699 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12856A88 + .long 0x1231A0C0 + .long 0x12866A88 + .long 0x1252A0C0 + .long 0x7DC85699 + .long 0x12876A88 + .long 0x1273A0C0 + .long 0x7DA75699 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + .long 0x7D9B5699 + .long 0x12815888 + .long 0x1273A0C0 + .long 0x7D605699 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + .long 0x7DDD5699 + .long 0x12806888 + .long 0x1273A0C0 + .long 0x7DBC5699 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + + + + .long 0xF00F7A57 + .long 0xF0308257 + .long 0xF0518A57 + .long 0xF0729257 + .long 0xF0939A57 + .long 0x11EF00C0 + .long 0x121008C0 + .long 0x123110C0 + .long 0x125218C0 + .long 0x127320C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 
0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + beq .Ldone_vsx + + add 6,12,5 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF297C057 + vperm 21,23,24,31 + .long 0xF2D7C357 + + .long 0x7DE03699 + .long 0x7E1D3699 + + .long 0x12F4CEC4 + .long 0x12B5D6C4 + .long 0x1316DEC4 + .long 0x12D6E6C4 + vand 20,20,29 + vand 23,23,29 + vand 21,21,29 + vand 22,22,29 + + + .long 0x11384E8C + .long 0x10B42E8C + .long 0x10D7368C + .long 0x10F53E8C + .long 0x1116468C + vor 9,9,30 + + vperm 0,0,0,15 + vand 5,5, 16 + vperm 1,1,1,15 + vand 6,6, 16 + vperm 2,2,2,15 + vand 7,7, 16 + vperm 3,3,3,15 + vand 8,8, 16 + vperm 4,4,4,15 + vand 9,9, 16 + + .long 0x10A500C0 + vxor 0,0,0 + .long 0x10C608C0 + vxor 1,1,1 + .long 0x10E710C0 + vxor 2,2,2 + .long 0x110818C0 + vxor 3,3,3 + .long 0x112920C0 + vxor 4,4,4 + + xor. 5,5,5 + b .Last_vsx + +.align 4 +.Ldone_vsx: + ld 0,448(1) + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001919 + .long 0x7C3B1919 + .long 0x7C5C1919 + .long 0x7C7D1919 + .long 0x7C9E1919 + + lwz 12,388(1) + mtlr 0 + li 10,191 + li 11,207 + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,10,1 + addi 10,10,32 + lvx 22,11,1 + addi 11,11,32 + lvx 23,10,1 + addi 10,10,32 + lvx 24,11,1 + addi 11,11,32 + lvx 25,10,1 + addi 10,10,32 + lvx 26,11,1 + addi 11,11,32 + lvx 27,10,1 + addi 10,10,32 + lvx 28,11,1 + addi 11,11,32 + lvx 29,10,1 + addi 10,10,32 + lvx 30,11,1 + lvx 31,10,1 + ld 27,392(1) + ld 28,400(1) + ld 29,408(1) + ld 30,416(1) + ld 31,424(1) + addi 1,1,432 + blr +.long 0 +.byte 0,12,0x04,1,0x80,5,4,0 +.long 0 +.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx + +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 12 + addi 12,12,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 + +.long 0x00000000,0x03ffffff +.long 0x00000000,0x03ffffff +.long 0x00000000,0x0000001a +.long 0x00000000,0x0000001a +.long 0x00000000,0x00000028 +.long 0x00000000,0x00000028 +.long 0x00000000,0x0e0f0001 +.long 0x00000000,0x1e1f1011 +.long 0x01000000,0x01000000 +.long 0x01000000,0x01000000 +.long 0x07060504,0x03020100 +.long 0x0f0e0d0c,0x0b0a0908 + +.long 0x00000000,0x00000000 +.long 0x00000000,0x04050607 +.long 0x04050607,0x00000000 +.long 0x00000000,0x00000000 +.long 0x00000000,0x00000000 +.long 0x04050607,0x00000000 + +.long 0xffffffff,0x00000000 +.long 0xffffffff,0xffffffff +.long 0xffffffff,0x00000000 +.long 0xffffffff,0x00000000 +.long 0x00000000,0x00000000 +.long 0xffffffff,0x00000000 +.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 diff --git a/sys/crypto/openssl/powerpc64/vpaes-ppc.S b/sys/crypto/openssl/powerpc64/vpaes-ppc.S --- a/sys/crypto/openssl/powerpc64/vpaes-ppc.S +++ b/sys/crypto/openssl/powerpc64/vpaes-ppc.S @@ -674,7 +674,7 @@ vor 24,0,0 sub. 
30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -726,7 +726,7 @@ vor 24,25,25 sub. 30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -1044,7 +1044,7 @@ vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1066,7 +1066,7 @@ addi 9, 5, -15 vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1157,7 +1157,7 @@ vsldoi 1, 9, 7, 12 vxor 7, 7, 1 - vspltisb 1, 0x0f + vspltisb 1,0x0f vsldoi 4, 9, 7, 8 @@ -1253,7 +1253,7 @@ vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr @@ -1304,7 +1304,7 @@ vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr diff --git a/sys/crypto/openssl/powerpc64le/bn-ppc.S b/sys/crypto/openssl/powerpc64le/bn-ppc.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/powerpc64le/bn-ppc.S @@ -0,0 +1,1876 @@ +/* Do not modify. This file is auto-generated from ppc.pl. */ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.globl bn_sqr_comba4 +.type bn_sqr_comba4,@function +.globl bn_sqr_comba8 +.type bn_sqr_comba8,@function +.globl bn_mul_comba4 +.type bn_mul_comba4,@function +.globl bn_mul_comba8 +.type bn_mul_comba8,@function +.globl bn_sub_words +.type bn_sub_words,@function +.globl bn_add_words +.type bn_add_words,@function +.globl bn_div_words +.type bn_div_words,@function +.globl bn_sqr_words +.type bn_sqr_words,@function +.globl bn_mul_words +.type bn_mul_words,@function +.globl bn_mul_add_words +.type bn_mul_add_words,@function + + + +.machine "any" +.abiversion 2 +.text + + + + + + + + +.align 4 +bn_sqr_comba4: +.localentry bn_sqr_comba4,0 + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + + + + + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 9,0 + + + addc 10,7,10 + addze 11,8 + addze 9,9 + + std 10,8(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + + std 9,48(3) + std 10,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba4,.-bn_sqr_comba4 + + + + + + + + +.align 4 +bn_sqr_comba8: +.localentry bn_sqr_comba8,0 + + + + + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,0 + addze 9,0 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + std 10,8(3) + + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 
9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,0(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,0(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,48(3) + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,8(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,16(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,56(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,64(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,24(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,32(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,72(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 
9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,80(3) + + ld 5,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,40(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,88(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,96(3) + + + ld 5,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,104(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + std 11,112(3) + std 9, 120(3) + + + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba8,.-bn_sqr_comba8 + + + + + + + + +.align 4 +bn_mul_comba4: +.localentry bn_mul_comba4,0 + + + + + + + + + + + + + xor 0,0,0 + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,0 + addze 10,0 + + ld 6, 8(4) + ld 7, 0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,16(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + std 10,24(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,0 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,32(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,40(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + + std 10,48(3) + std 11,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba4,.-bn_mul_comba4 + + + + + + + + +.align 4 +bn_mul_comba8: +.localentry bn_mul_comba8,0 + + + + + + + + + + + + + xor 0,0,0 + + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + addze 12,9 + addze 10,0 + + ld 6,8(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + + ld 6,16(4) + ld 
7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,24(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,24(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,0(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,32(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,40(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,40(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,32(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,16(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,8(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,0(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,48(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,8(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,48(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,56(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,56(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,48(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,8(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,64(3) + + ld 6,16(4) + mulld 8,6,7 + 
mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,24(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,32(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,48(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,56(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,72(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,48(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,80(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,40(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,48(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,56(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,88(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,48(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,96(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,56(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,104(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + std 12,112(3) + std 10,120(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba8,.-bn_mul_comba8 + + + + + + + + +.align 4 +bn_sub_words: +.localentry bn_sub_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + subfc. 7,0,6 + + + beq .Lppcasm_sub_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_sub_mainloop: + ldu 7,8(4) + ldu 8,8(5) + subfe 6,8,7 + + + stdu 6,8(3) + bdnz .Lppcasm_sub_mainloop +.Lppcasm_sub_adios: + subfze 3,0 + andi. 3,3,1 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_sub_words,.-bn_sub_words + + + + + + + + +.align 4 +bn_add_words: +.localentry bn_add_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + addic. 6,6,0 + beq .Lppcasm_add_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_add_mainloop: + ldu 7,8(4) + ldu 8,8(5) + adde 8,7,8 + stdu 8,8(3) + bdnz .Lppcasm_add_mainloop +.Lppcasm_add_adios: + addze 3,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_add_words,.-bn_add_words + + + + + + + + +.align 4 +bn_div_words: +.localentry bn_div_words,0 + + + + + + + + + + + + + cmpldi 0,5,0 + bne .Lppcasm_div1 + li 3,-1 + blr +.Lppcasm_div1: + xor 0,0,0 + li 8,64 + cntlzd. 7,5 + beq .Lppcasm_div2 + subf 8,7,8 + srd. 
9,3,8 + td 16,9,0 +.Lppcasm_div2: + cmpld 0,3,5 + blt .Lppcasm_div3 + subf 3,5,3 +.Lppcasm_div3: + cmpi 0,0,7,0 + beq .Lppcasm_div4 + sld 3,3,7 + srd 8,4,8 + sld 5,5,7 + or 3,3,8 + sld 4,4,7 +.Lppcasm_div4: + srdi 9,5,32 + + + li 6,2 + mtctr 6 +.Lppcasm_divouterloop: + srdi 8,3,32 + srdi 11,4,32 + + cmpld 0,8,9 + bne .Lppcasm_div5 + + li 8,-1 + clrldi 8,8,32 + b .Lppcasm_div6 +.Lppcasm_div5: + divdu 8,3,9 +.Lppcasm_div6: + mulld 12,9,8 + clrldi 10,5,32 + mulld 6,8,10 + +.Lppcasm_divinnerloop: + subf 10,12,3 + srdi 7,10,32 + addic. 7,7,0 + + + + sldi 7,10,32 + or 7,7,11 + cmpld 1,6,7 + bne .Lppcasm_divinnerexit + ble 1,.Lppcasm_divinnerexit + addi 8,8,-1 + subf 12,9,12 + clrldi 10,5,32 + subf 6,10,6 + b .Lppcasm_divinnerloop +.Lppcasm_divinnerexit: + srdi 10,6,32 + sldi 11,6,32 + cmpld 1,4,11 + add 12,12,10 + bge 1,.Lppcasm_div7 + addi 12,12,1 +.Lppcasm_div7: + subf 11,11,4 + cmpld 1,3,12 + bge 1,.Lppcasm_div8 + addi 8,8,-1 + add 3,5,3 +.Lppcasm_div8: + subf 12,12,3 + sldi 4,11,32 + + + + insrdi 11,12,32,32 + rotldi 3,11,32 + bdz .Lppcasm_div9 + sldi 0,8,32 + b .Lppcasm_divouterloop +.Lppcasm_div9: + or 3,8,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_div_words,.-bn_div_words + + + + + + + +.align 4 +bn_sqr_words: +.localentry bn_sqr_words,0 + + + + + + + + + + + + + + + + addic. 5,5,0 + beq .Lppcasm_sqr_adios + addi 4,4,-8 + addi 3,3,-8 + mtctr 5 +.Lppcasm_sqr_mainloop: + + ldu 6,8(4) + mulld 7,6,6 + mulhdu 8,6,6 + stdu 7,8(3) + stdu 8,8(3) + bdnz .Lppcasm_sqr_mainloop +.Lppcasm_sqr_adios: + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_sqr_words,.-bn_sqr_words + + + + + + + + +.align 4 +bn_mul_words: +.localentry bn_mul_words,0 + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_mw_REM + mtctr 7 +.Lppcasm_mw_LOOP: + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + + + + + std 9,0(3) + + ld 8,8(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + + std 11,8(3) + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + adde 9,9,12 + + std 9,16(3) + + ld 8,24(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + + std 11,24(3) + + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_mw_LOOP + +.Lppcasm_mw_REM: + andi. 5,5,0x3 + beq .Lppcasm_mw_OVER + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,0(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + + ld 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,8(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,16(3) + addi 12,10,0 + +.Lppcasm_mw_OVER: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_words,.-bn_mul_words + + + + + + + + +.align 4 +bn_mul_add_words: +.localentry bn_mul_add_words,0 + + + + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 
7,5,30,2,31 + beq .Lppcasm_maw_leftover + mtctr 7 +.Lppcasm_maw_mainloop: + + ld 8,0(4) + ld 11,0(3) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + addc 9,9,11 + + + + + + + std 9,0(3) + + + ld 8,8(4) + ld 9,8(3) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + + std 11,8(3) + + + ld 8,16(4) + mulld 9,6,8 + ld 11,16(3) + mulhdu 10,6,8 + adde 9,9,12 + addze 10,10 + addc 9,9,11 + + std 9,16(3) + + + ld 8,24(4) + mulld 11,6,8 + ld 9,24(3) + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + addze 12,12 + std 11,24(3) + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_maw_mainloop + +.Lppcasm_maw_leftover: + andi. 5,5,0x3 + beq .Lppcasm_maw_adios + addi 3,3,-8 + addi 4,4,-8 + + mtctr 5 + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + +.Lppcasm_maw_adios: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_add_words,.-bn_mul_add_words +.align 4 diff --git a/sys/crypto/openssl/powerpc64le/ecp_nistp521-ppc64.S b/sys/crypto/openssl/powerpc64le/ecp_nistp521-ppc64.S new file mode 100644 --- /dev/null +++ b/sys/crypto/openssl/powerpc64le/ecp_nistp521-ppc64.S @@ -0,0 +1,354 @@ +/* Do not modify. This file is auto-generated from ecp_nistp521-ppc64.pl. */ +.machine "any" +.abiversion 2 +.text + +.globl p521_felem_mul +.type p521_felem_mul,@function +.align 5 +p521_felem_mul: +.localentry p521_felem_mul,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + lxsd 3,0(5) + lxsd 4,8(5) + lxsd 5,16(5) + lxsd 6,24(5) + lxsd 7,32(5) + lxsd 8,40(5) + lxsd 9,48(5) + lxsd 10,56(5) + lxsd 11,64(5) + + .long 0x12ED1823 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13011023 + + xxpermdi 34,37,36,0b00 + .long 0x13211023 + .long 0x132F1E63 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,36,35,0b00 + .long 0x134CB6A3 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + xxpermdi 54,37,36,0b00 + .long 0x136CB6E3 + .long 0x13711EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + xxpermdi 54,38,37,0b00 + .long 0x138CB723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 54,39,38,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + + xxpermdi 33,49,50,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13811723 + + xxpermdi 34,37,36,0b00 + .long 0x13A11763 + .long 0x13B31F63 + + xxpermdi 34,38,37,0b00 + .long 0x13C117A3 + xxpermdi 44,51,52,0b00 + xxpermdi 54,36,35,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,39,38,0b00 + .long 0x13E117E3 + xxpermdi 54,37,36,0b00 + .long 0x13ECB7E3 + .long 0x13F51FE3 + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x10630DC4 + .long 0x10840DC4 + .long 0x10A50DC4 + .long 
0x10C60DC4 + .long 0x10E70DC4 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13D55FA3 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13A11763 + + xxpermdi 33,51,52,0b00 + .long 0x13811723 + .long 0x13954F23 + + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + xxpermdi 54,41,40,0b00 + xxpermdi 44,52,53,0b00 + .long 0x136CB6E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + xxpermdi 44,51,52,0b00 + .long 0x134CB6A3 + .long 0x13553EA3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + xxpermdi 44,50,51,0b00 + .long 0x132CB663 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + xxpermdi 44,49,50,0b00 + .long 0x130CB623 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 44,48,49,0b00 + .long 0x12ECB5E3 + + xxpermdi 34,39,38,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13211663 + + xxpermdi 33,51,52,0b00 + .long 0x13011623 + .long 0x13152E23 + + xxpermdi 33,50,51,0b00 + .long 0x12E115E3 + xxpermdi 54,37,36,0b00 + xxpermdi 44,52,53,0b00 + .long 0x12ECB5E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + stxv 61,96(3) + stxv 62,112(3) + stxv 63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_mul,.-p521_felem_mul + +.globl p521_felem_square +.type p521_felem_square,@function +.align 5 +p521_felem_square: +.localentry p521_felem_square,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x106D0DC4 + .long 0x108E0DC4 + .long 0x10AF0DC4 + .long 0x10D00DC4 + .long 0x10F10DC4 + .long 0x11120DC4 + .long 0x11330DC4 + .long 0x11540DC4 + .long 0x11750DC4 + .long 0x12ED6823 + + .long 0x130D2023 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,37,46,0b00 + .long 0x13211023 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + .long 0x136F7EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + .long 0x138F3723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,39,48,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + .long 0x13F18FE3 + + .long 0x13124623 + + .long 0x13534EA3 + + .long 0x13945723 + + .long 0x13D55FA3 + + mtvsrdd 33,9,8 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13B45F63 + + .long 0x13935F23 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + .long 0x13324E63 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + .long 0x13114E23 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 34,41,40,0b00 + xxpermdi 33,48,49,0b00 + .long 0x12E115E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + stxv 61,96(3) + stxv 62,112(3) + stxv 
63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_square,.-p521_felem_square + diff --git a/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S b/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S --- a/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S +++ b/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S @@ -304,19 +304,19 @@ dword_le_load: .localentry dword_le_load,0 - lbzu 0,1(3) - lbzu 4,1(3) - lbzu 5,1(3) + lbz 0,1(3) + lbz 4,2(3) + lbz 5,3(3) insrdi 0,4,8,48 - lbzu 4,1(3) + lbz 4,4(3) insrdi 0,5,8,40 - lbzu 5,1(3) + lbz 5,5(3) insrdi 0,4,8,32 - lbzu 4,1(3) + lbz 4,6(3) insrdi 0,5,8,24 - lbzu 5,1(3) + lbz 5,7(3) insrdi 0,4,8,16 - lbzu 4,1(3) + lbzu 4,8(3) insrdi 0,5,8,8 insrdi 0,4,8,0 blr @@ -579,21 +579,21 @@ cmpldi 30,8 blt .Lsqueeze_tail - stbu 0,1(29) + stb 0,1(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,2(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,3(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,4(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,5(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,6(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,7(29) srdi 0,0,8 - stbu 0,1(29) + stbu 0,8(29) subic. 30,30,8 beq .Lsqueeze_done diff --git a/sys/crypto/openssl/powerpc64le/poly1305-ppc.S b/sys/crypto/openssl/powerpc64le/poly1305-ppc.S --- a/sys/crypto/openssl/powerpc64le/poly1305-ppc.S +++ b/sys/crypto/openssl/powerpc64le/poly1305-ppc.S @@ -12,6 +12,7 @@ std 0,0(3) std 0,8(3) std 0,16(3) + stw 0,24(3) cmpld 4,0 beq- .Lno_key @@ -41,6 +42,7 @@ poly1305_blocks: .localentry poly1305_blocks,0 +.Lpoly1305_blocks: srdi. 5,5,4 beq- .Labort @@ -124,39 +126,1003 @@ .long 0 .byte 0,12,4,1,0x80,5,4,0 .size poly1305_blocks,.-poly1305_blocks - .globl poly1305_emit .type poly1305_emit,@function -.align 4 +.align 5 poly1305_emit: .localentry poly1305_emit,0 - ld 7,0(3) - ld 8,8(3) - ld 9,16(3) - ld 6,0(5) - ld 5,8(5) + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + lwz 0,24(3) + + sldi 8,8,26 + sldi 12,9,52 + srdi 9,9,12 + sldi 10,10,14 + add 7,7,8 + addc 7,7,12 + sldi 12,11,40 + srdi 11,11,24 + adde 8,9,10 + addc 8,8,12 + addze 9,11 + + ld 10,0(3) + ld 11,8(3) + ld 12,16(3) + + neg 0,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 + and 7,7,0 + and 8,8,0 + and 9,9,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 addic 10,7,5 addze 11,8 addze 12,9 - srdi 0,12,2 - neg 0,0 + srdi 12,12,2 + neg 12,12 - andc 7,7,0 - and 10,10,0 - andc 8,8,0 - and 11,11,0 + andc 7,7,12 + and 10,10,12 + andc 8,8,12 + and 11,11,12 or 7,7,10 or 8,8,11 - addc 7,7,6 - adde 8,8,5 - std 7,0(4) - std 8,8(4) + + lwz 12,4(5) + lwz 9,12(5) + lwz 10,0(5) + lwz 11,8(5) + + insrdi 10,12,32,0 + insrdi 11,9,32,0 + + addc 7,7,10 + adde 8,8,11 + + addi 3,4,-1 + addi 4,4,7 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + stbu 8,1(4) + blr .long 0 .byte 0,12,0x14,0,0,0,3,0 .size poly1305_emit,.-poly1305_emit -.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl 
poly1305_blocks_vsx +.type poly1305_blocks_vsx,@function +.align 5 +poly1305_blocks_vsx: +.localentry poly1305_blocks_vsx,0 + + lwz 7,24(3) + cmpldi 5,128 + bge __poly1305_blocks_vsx + + neg 0,7 + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + + sldi 8,8,26 + sldi 12,9,52 + add 7,7,8 + srdi 9,9,12 + sldi 10,10,14 + addc 7,7,12 + sldi 8,11,40 + adde 9,9,10 + srdi 11,11,24 + addc 9,9,8 + addze 11,11 + + ld 8,0(3) + ld 10,8(3) + ld 12,16(3) + + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + and 7,7,0 + and 9,9,0 + and 11,11,0 + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + + li 0,0 + std 7,0(3) + std 9,8(3) + std 11,16(3) + stw 0,24(3) + + b .Lpoly1305_blocks +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.size poly1305_blocks_vsx,.-poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + mulld 9,6,27 + mulhdu 10,6,27 + + mulld 30,7,29 + mulhdu 31,7,29 + addc 9,9,30 + adde 10,10,31 + + mulld 30,6,28 + mulhdu 11,6,28 + addc 10,10,30 + addze 11,11 + + mulld 30,7,27 + mulhdu 31,7,27 + addc 10,10,30 + adde 11,11,31 + + mulld 30,8,29 + mulld 31,8,27 + addc 10,10,30 + adde 11,11,31 + + andc 30,11,0 + and 8,11,0 + srdi 31,30,2 + add 30,30,31 + addc 6,9,30 + addze 7,10 + addze 8,8 + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_splat: + rldicl 9,6,0,38 + rldicl 10,6,38,38 + stw 9,0x00(31) + + rldicl 11,6,12,52 + slwi 9,10,2 + stw 10,0x10(31) + add 9,9,10 + stw 9,0x20(31) + + insrdi 11,7,14,38 + slwi 9,11,2 + stw 11,0x30(31) + add 9,9,11 + stw 9,0x40(31) + + rldicl 10,7,50,38 + rldicl 11,7,24,40 + slwi 9,10,2 + stw 10,0x50(31) + add 9,9,10 + stw 9,0x60(31) + + insrdi 11,8,3,37 + slwi 9,11,2 + stw 11,0x70(31) + add 9,9,11 + stw 9,0x80(31) + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_splat,.-__poly1305_splat + +.align 5 +__poly1305_blocks_vsx: + stdu 1,-432(1) + mflr 0 + li 10,191 + li 11,207 + li 12,-1 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,10,1 + addi 10,10,32 + stvx 24,11,1 + addi 11,11,32 + stvx 25,10,1 + addi 10,10,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + stw 12,388(1) + li 12,-1 + or 12,12,12 + std 27,392(1) + std 28,400(1) + std 29,408(1) + std 30,416(1) + std 31,424(1) + std 0,448(1) + + bl .LPICmeup + + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + .long 0x7FA06699 + .long 0x7F3B6699 + .long 0x7F7C6699 + .long 0x7FFD6699 + .long 0x7FDE6699 + + cmplwi 7,0 + bne .Lskip_init_vsx + + ld 27,32(3) + ld 28,40(3) + srdi 29,28,2 + li 0,3 + add 29,29,28 + + mr 6,27 + mr 7,28 + li 8,0 + addi 31,3,60 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,52 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,56 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,48 + bl __poly1305_splat + + ld 6,0(3) + ld 7,8(3) + ld 8,16(3) + + rldicl 9,6,0,38 + rldicl 10,6,38,38 + rldicl 11,6,12,52 + .long 0x7C0901E7 + insrdi 11,7,14,38 + .long 0x7C2A01E7 + rldicl 10,7,50,38 + .long 0x7C4B01E7 + rldicl 11,7,24,40 + .long 0x7C6A01E7 + insrdi 11,8,3,37 + .long 0x7C8B01E7 + li 0,1 + stw 0,24(3) + b .Loaded_vsx + +.align 4 +.Lskip_init_vsx: + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001819 + .long 0x7C3B1819 + .long 0x7C5C1819 + .long 0x7C7D1819 + .long 0x7C9E1819 + +.Loaded_vsx: + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + li 7,0x60 + li 8,0x70 + addi 10,3,64 + addi 11,1,63 + + vxor 20,20,20 + .long 0xF000A057 + .long 
0xF021A057 + .long 0xF042A057 + .long 0xF063A057 + .long 0xF084A057 + + + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + + + + + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + vspltisb 28,14 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x1295CEC4 + .long 0x12D6D6C4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + vand 21,21,29 + vand 20,20,29 + vand 22,22,29 + vand 23,23,29 + + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10D4368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x7D5D1A99 + .long 0x7D605299 + .long 0x7D9B5299 + .long 0x7DBC5299 + .long 0x7DDD5299 + .long 0x7EBE5299 + .long 0x7EDF5299 + .long 0x7EE75299 + .long 0x7F085299 + stvx 11,0,11 + stvx 12,27,11 + stvx 13,28,11 + stvx 14,29,11 + stvx 21,30,11 + stvx 22,31,11 + stvx 23,7,11 + stvx 24,8,11 + + addi 4,4,0x40 + addi 12,12,0x50 + addi 0,5,-64 + srdi 0,0,6 + mtctr 0 + b .Loop_vsx + +.align 4 +.Loop_vsx: + + + + + + + + + + + + + + + .long 0x11E55288 + .long 0x12055A88 + .long 0x12256A88 + .long 0x12466A88 + + .long 0x12865288 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12676A88 + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12875A88 + .long 0x1252A0C0 + lvx 12,31,11 + .long 0x12885A88 + .long 0x1273A0C0 + lvx 11,30,11 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12875288 + .long 0x1231A0C0 + .long 0x12885288 + .long 0x1252A0C0 + lvx 14,8,11 + .long 0x12895288 + .long 0x1273A0C0 + lvx 13,7,11 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + + + + + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + lvx 12,27,11 + .long 0x12815888 + .long 0x1273A0C0 + lvx 11,0,11 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + lvx 14,29,11 + .long 0x12806888 + .long 0x1273A0C0 + lvx 13,28,11 + + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12D6D6C4 + .long 0x1355CEC4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + 
.long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + vand 21,21,29 + vand 26,26,29 + vand 22,22,29 + vand 23,23,29 + + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10DA368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + addi 4,4,0x40 + bdnz .Loop_vsx + + neg 5,5 + andi. 5,5,0x30 + sub 4,4,5 + + .long 0x7D5D1E99 + .long 0x7D605699 + .long 0x7D9B5699 + .long 0x7DBC5699 + .long 0x7DDD5699 + +.Last_vsx: + .long 0x11E55288 + .long 0x12065288 + .long 0x12275288 + .long 0x12485288 + .long 0x12695288 + + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12855A88 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12875A88 + .long 0x1252A0C0 + .long 0x7D9F5699 + .long 0x12885A88 + .long 0x1273A0C0 + .long 0x7D7E5699 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12856A88 + .long 0x1231A0C0 + .long 0x12866A88 + .long 0x1252A0C0 + .long 0x7DC85699 + .long 0x12876A88 + .long 0x1273A0C0 + .long 0x7DA75699 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + .long 0x7D9B5699 + .long 0x12815888 + .long 0x1273A0C0 + .long 0x7D605699 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + .long 0x7DDD5699 + .long 0x12806888 + .long 0x1273A0C0 + .long 0x7DBC5699 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + + + + .long 0xF00F7A57 + .long 0xF0308257 + .long 0xF0518A57 + .long 0xF0729257 + .long 0xF0939A57 + .long 0x11EF00C0 + .long 0x121008C0 + .long 0x123110C0 + .long 0x125218C0 + .long 0x127320C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 
0x108490C0 + + beq .Ldone_vsx + + add 6,12,5 + + + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + + + + + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF297C057 + vperm 21,23,24,31 + .long 0xF2D7C357 + + .long 0x7DE03699 + .long 0x7E1D3699 + + .long 0x12F4CEC4 + .long 0x12B5D6C4 + .long 0x1316DEC4 + .long 0x12D6E6C4 + vand 20,20,29 + vand 23,23,29 + vand 21,21,29 + vand 22,22,29 + + + .long 0x11384E8C + .long 0x10B42E8C + .long 0x10D7368C + .long 0x10F53E8C + .long 0x1116468C + vor 9,9,30 + + vperm 0,0,0,15 + vand 5,5, 16 + vperm 1,1,1,15 + vand 6,6, 16 + vperm 2,2,2,15 + vand 7,7, 16 + vperm 3,3,3,15 + vand 8,8, 16 + vperm 4,4,4,15 + vand 9,9, 16 + + .long 0x10A500C0 + vxor 0,0,0 + .long 0x10C608C0 + vxor 1,1,1 + .long 0x10E710C0 + vxor 2,2,2 + .long 0x110818C0 + vxor 3,3,3 + .long 0x112920C0 + vxor 4,4,4 + + xor. 5,5,5 + b .Last_vsx + +.align 4 +.Ldone_vsx: + ld 0,448(1) + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001919 + .long 0x7C3B1919 + .long 0x7C5C1919 + .long 0x7C7D1919 + .long 0x7C9E1919 + + lwz 12,388(1) + mtlr 0 + li 10,191 + li 11,207 + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,10,1 + addi 10,10,32 + lvx 22,11,1 + addi 11,11,32 + lvx 23,10,1 + addi 10,10,32 + lvx 24,11,1 + addi 11,11,32 + lvx 25,10,1 + addi 10,10,32 + lvx 26,11,1 + addi 11,11,32 + lvx 27,10,1 + addi 10,10,32 + lvx 28,11,1 + addi 11,11,32 + lvx 29,10,1 + addi 10,10,32 + lvx 30,11,1 + lvx 31,10,1 + ld 27,392(1) + ld 28,400(1) + ld 29,408(1) + ld 30,416(1) + ld 31,424(1) + addi 1,1,432 + blr +.long 0 +.byte 0,12,0x04,1,0x80,5,4,0 +.long 0 +.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx + +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 12 + addi 12,12,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 + +.long 0x03ffffff,0x00000000 +.long 0x03ffffff,0x00000000 +.long 0x0000001a,0x00000000 +.long 0x0000001a,0x00000000 +.long 0x00000028,0x00000000 +.long 0x00000028,0x00000000 +.long 0x0e0f0001,0x00000000 +.long 0x1e1f1011,0x00000000 +.long 0x01000000,0x01000000 +.long 0x01000000,0x01000000 +.long 0x03020100,0x07060504 +.long 0x0b0a0908,0x0f0e0d0c + +.long 0x00000000,0x00000000 +.long 0x04050607,0x00000000 +.long 0x00000000,0x04050607 +.long 0x00000000,0x00000000 +.long 0x00000000,0x00000000 +.long 0x00000000,0x04050607 + +.long 0x00000000,0xffffffff +.long 0xffffffff,0xffffffff +.long 0x00000000,0xffffffff +.long 0x00000000,0xffffffff +.long 0x00000000,0x00000000 +.long 0x00000000,0xffffffff +.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 diff --git a/sys/crypto/openssl/powerpc64le/vpaes-ppc.S b/sys/crypto/openssl/powerpc64le/vpaes-ppc.S --- a/sys/crypto/openssl/powerpc64le/vpaes-ppc.S +++ b/sys/crypto/openssl/powerpc64le/vpaes-ppc.S @@ -674,7 +674,7 @@ vor 24,0,0 sub. 30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -726,7 +726,7 @@ vor 24,25,25 sub. 
30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -1044,7 +1044,7 @@ vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1066,7 +1066,7 @@ addi 9, 5, -15 vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1157,7 +1157,7 @@ vsldoi 1, 7, 9, 16-12 vxor 7, 7, 1 - vspltisb 1, 0x0f + vspltisb 1,0x0f vsldoi 4, 7, 9, 16-8 @@ -1253,7 +1253,7 @@ vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr @@ -1304,7 +1304,7 @@ vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr