Index: head/sys/crypto/aesni/aesni_ghash.c
===================================================================
--- head/sys/crypto/aesni/aesni_ghash.c	(revision 322599)
+++ head/sys/crypto/aesni/aesni_ghash.c	(revision 322600)
@@ -1,804 +1,808 @@
/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * $FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
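gfmul() above is the single-block GHASH primitive: a carry-less multiply of two 128-bit operands (already byte-reflected) followed by reduction modulo the GHASH polynomial. As a minimal sketch of how such a primitive is typically driven, here is a hypothetical per-block GHASH update loop. ghash_blocks() and its length contract (len a multiple of 16) are illustrative and not part of this file; it assumes it sits in the same translation unit as gfmul().

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>
#include <tmmintrin.h>	/* _mm_shuffle_epi8 */

/* Defined earlier in this file; redeclared only so the sketch is self-contained. */
static void gfmul(__m128i a, __m128i b, __m128i *res);

/*
 * Illustrative GHASH update: byte-swap each 16-byte block into the
 * bit-reflected order gfmul() expects, XOR it into the accumulator X,
 * then multiply by the hash key H.
 */
static __m128i
ghash_blocks(__m128i X, __m128i H, const uint8_t *buf, size_t len)
{
	const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
	    8, 9, 10, 11, 12, 13, 14, 15);
	__m128i blk;
	size_t i;

	for (i = 0; i < len / 16; i++) {
		blk = _mm_loadu_si128((const __m128i *)(buf + i * 16));
		blk = _mm_shuffle_epi8(blk, BSWAP_MASK);
		X = _mm_xor_si128(X, blk);
		gfmul(X, H, &X);	/* X = (X ^ blk) * H in GF(2^128) */
	}
	return (X);
}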
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);

	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);

	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);

	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
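reduce4() computes H1*X1 ^ H2*X2 ^ H3*X3 ^ H4*X4 and performs a single reduction, which is what lets four GHASH blocks be folded in per reduction. The sketch below is a hypothetical driver loop for that primitive, using the aggregated-reduction identity X <- ((X ^ C1)*H^4) ^ (C2*H^3) ^ (C3*H^2) ^ (C4*H); the argument pairing shown (oldest block of the group against the highest power) follows that identity and is illustrative only, since the call sites in AES_GCM_encrypt are not reproduced in this excerpt. ghash4_blocks() and its requirement that len be a multiple of 64 bytes are assumptions of the sketch.

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>
#include <tmmintrin.h>

/* Defined earlier in this file. */
static void gfmul(__m128i a, __m128i b, __m128i *res);
static void reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res);

/* Hypothetical: hash len bytes (a multiple of 64) four blocks at a time. */
static __m128i
ghash4_blocks(__m128i X, __m128i H, const uint8_t *buf, size_t len)
{
	const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
	    8, 9, 10, 11, 12, 13, 14, 15);
	__m128i H2, H3, H4, c1, c2, c3, c4;
	size_t i;

	/* Precompute the powers of the hash key used by reduce4(). */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	for (i = 0; i + 4 <= len / 16; i += 4) {
		c1 = _mm_shuffle_epi8(_mm_loadu_si128(
		    (const __m128i *)(buf + (i + 0) * 16)), BSWAP_MASK);
		c2 = _mm_shuffle_epi8(_mm_loadu_si128(
		    (const __m128i *)(buf + (i + 1) * 16)), BSWAP_MASK);
		c3 = _mm_shuffle_epi8(_mm_loadu_si128(
		    (const __m128i *)(buf + (i + 2) * 16)), BSWAP_MASK);
		c4 = _mm_shuffle_epi8(_mm_loadu_si128(
		    (const __m128i *)(buf + (i + 3) * 16)), BSWAP_MASK);
		/* Oldest block carries the accumulator and pairs with H^4. */
		reduce4(H, H2, H3, H4, c4, c3, c2,
		    _mm_xor_si128(X, c1), &X);
	}
	return (X);
}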
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j ,k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
-	__m128i *KEY = (__m128i*)key;
+	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
-		Y = _mm_loadu_si128((__m128i*)ivec);
+		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j
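For the common 96-bit IV, GCM defines the pre-counter block as J0 = IV || 0^31 || 1, and the code above builds it by loading the IV and writing a big-endian 1 into the last 32-bit lane with _mm_insert_epi32(Y, 0x1000000, 3). Only that counter-block setup is illustrated here; the scalar helper below is a hypothetical, byte-level restatement of the same layout, not code from this file.

#include <stdint.h>
#include <string.h>

/*
 * Hypothetical scalar equivalent of the 96-bit-IV setup above:
 * J0 = IV || 0x00000001, i.e. bytes 12..15 become 00 00 00 01,
 * which is exactly what inserting 0x1000000 into lane 3 produces.
 */
static void
gcm_j0_from_96bit_iv(const uint8_t iv[12], uint8_t j0[16])
{
	memcpy(j0, iv, 12);
	j0[12] = 0x00;
	j0[13] = 0x00;
	j0[14] = 0x00;
	j0[15] = 0x01;
}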
Index: head/sys/crypto/aesni/aesni_wrap.c
===================================================================
--- head/sys/crypto/aesni/aesni_wrap.c	(revision 322599)
+++ head/sys/crypto/aesni/aesni_wrap.c	(revision 322600)
/*-
 * Copyright (c) 2010 Konstantin Belousov
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek
 * Copyright 2012-2013 John-Mark Gurney
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by John-Mark Gurney
 * under sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <crypto/aesni/aesni.h>

#include <opencrypto/gmac.h>

#include "aesencdec.h"
#include <smmintrin.h>

MALLOC_DECLARE(M_AESNI);

struct blocks8 {
	__m128i	blk[8];
} __packed;

void
aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot, ivreg;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = _mm_loadu_si128((const __m128i *)iv);
	for (i = 0; i < len; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
		ivreg = tot;
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i blocks[8];
	struct blocks8 *blks;
	__m128i ivreg, nextiv;
	size_t i, j, cnt;

	ivreg = _mm_loadu_si128((const __m128i *)iv);
	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (struct blocks8 *)buf;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0],
		    blks->blk[1], blks->blk[2], blks->blk[3], blks->blk[4],
		    blks->blk[5], blks->blk[6], blks->blk[7], &blocks[0]);
		for (j = 0; j < 8; j++) {
			nextiv = blks->blk[j];
			blks->blk[j] = blocks[j] ^ ivreg;
			ivreg = nextiv;
		}
		buf += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		nextiv = _mm_loadu_si128((void *)buf);
		_mm_storeu_si128((void *)buf,
		    aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
		ivreg = nextiv;
		buf += AES_BLOCK_LEN;
	}
}

void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, blks->blk[0],
		    blks->blk[1], blks->blk[2], blks->blk[3], blks->blk[4],
		    blks->blk[5], blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tout[8];
	const struct blocks8 *blks;
	struct blocks8 *top;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0],
		    blks->blk[1], blks->blk[2], blks->blk[3], blks->blk[4],
		    blks->blk[5], blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}
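The struct blocks8 batching lets ECB and CBC decryption push eight independent blocks through aesni_dec8()/aesni_enc8() at once, while CBC encryption stays serial because each block's input depends on the previous ciphertext; note also that aesni_decrypt_cbc() works in place on buf, unlike aesni_encrypt_cbc(). The round trip below is a hypothetical usage sketch, assuming key schedules already expanded elsewhere (e.g. by aesni_set_enckey()/aesni_set_deckey()) and len a multiple of AES_BLOCK_LEN (16).

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical usage of the CBC wrappers above: encrypt pt into buf,
 * then decrypt buf in place, recovering the original plaintext.
 */
static void
cbc_round_trip(int rounds, const void *enc_sched, const void *dec_sched,
    const uint8_t iv[16], const uint8_t *pt, uint8_t *buf, size_t len)
{
	aesni_encrypt_cbc(rounds, enc_sched, len, pt, buf, iv);
	aesni_decrypt_cbc(rounds, dec_sched, len, buf, iv);
	/* buf now holds the original plaintext again. */
}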
/*
 * mixed endian increment, low 64bits stored in hi word to be compatible
 * with _icm's BSWAP.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	x = _mm_sub_epi64(x, t);

	return x;
}

void
aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i BSWAP_EPI64;
	__m128i tout[8];
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt;

	BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);

-	ctr1 = _mm_loadu_si128((__m128i*)iv);
+	ctr1 = _mm_loadu_si128((const __m128i *)iv);
	ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr2 = nextc(ctr1);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		ctr3 = nextc(ctr2);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		ctr4 = nextc(ctr3);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		ctr5 = nextc(ctr4);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		ctr6 = nextc(ctr5);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		ctr7 = nextc(ctr6);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		ctr8 = nextc(ctr7);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
		ctr1 = nextc(ctr8);

		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
		    tmp5, tmp6, tmp7, tmp8, tout);

		top->blk[0] = blks->blk[0] ^ tout[0];
		top->blk[1] = blks->blk[1] ^ tout[1];
		top->blk[2] = blks->blk[2] ^ tout[2];
		top->blk[3] = blks->blk[3] ^ tout[3];
		top->blk[4] = blks->blk[4] ^ tout[4];
		top->blk[5] = blks->blk[5] ^ tout[5];
		top->blk[6] = blks->blk[6] ^ tout[6];
		top->blk[7] = blks->blk[7] ^ tout[7];

		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = nextc(ctr1);
		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
		tot = tot ^ _mm_loadu_si128((const __m128i *)from);
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
	/* handle remaining partial round */
	if (len % AES_BLOCK_LEN != 0) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
		tot = tot ^ _mm_loadu_si128((const __m128i *)from);
		memcpy(to, &tot, len % AES_BLOCK_LEN);
	}
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

static inline __m128i
xts_crank_lfsr(__m128i inp)
{
	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
	__m128i xtweak, ret;

	/* set up xor mask */
	xtweak = _mm_shuffle_epi32(inp, 0x93);
	xtweak = _mm_srai_epi32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = _mm_slli_epi32(inp, 1);
	ret ^= xtweak;

	return ret;
}

static void
aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i block;

	block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;

	if (do_encrypt)
		block = aesni_enc(rounds - 1, key_schedule, block);
	else
		block = aesni_dec(rounds - 1, key_schedule, block);

	_mm_storeu_si128((__m128i *)to, block ^ *tweak);

	*tweak = xts_crank_lfsr(*tweak);
}
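xts_crank_lfsr() advances the tweak to the next block, i.e. multiplies it by the generator alpha in GF(2^128): a one-bit left shift of the 128-bit value with the polynomial AES_XTS_ALPHA (0x87) folded back in when the top bit shifts out. The SSE version above does this lane-wise with a rotated sign mask; the byte-wise scalar version below is a sketch of the same IEEE P1619 update for reference, written independently and not verified bit-for-bit against the vector code.

#include <stdint.h>

/*
 * Scalar sketch of the XTS tweak update (multiplication by alpha):
 * shift the 128-bit tweak left by one bit, byte 0 least significant,
 * and fold 0x87 into byte 0 when a bit falls out of byte 15.
 */
static void
xts_next_tweak_scalar(uint8_t tweak[16])
{
	unsigned carry = 0, c;
	int i;

	for (i = 0; i < 16; i++) {
		c = tweak[i] >> 7;
		tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
		carry = c;
	}
	if (carry)
		tweak[0] ^= 0x87;	/* AES_XTS_ALPHA */
}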
static void
aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule,
    __m128i *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i tmptweak;
	__m128i a, b, c, d, e, f, g, h;
	__m128i tweaks[8];
	__m128i tmp[8];
	__m128i *top;
	const __m128i *fromp;

	tmptweak = *tweak;

	/*
	 * unroll the loop.  This lets gcc put values directly in the
	 * register and saves memory accesses.
	 */
	fromp = (const __m128i *)from;
#define PREPINP(v, pos) 					\
		do {						\
			tweaks[(pos)] = tmptweak;		\
			(v) = _mm_loadu_si128(&fromp[pos]) ^	\
			    tmptweak;				\
			tmptweak = xts_crank_lfsr(tmptweak);	\
		} while (0)
	PREPINP(a, 0);
	PREPINP(b, 1);
	PREPINP(c, 2);
	PREPINP(d, 3);
	PREPINP(e, 4);
	PREPINP(f, 5);
	PREPINP(g, 6);
	PREPINP(h, 7);
	*tweak = tmptweak;

	if (do_encrypt)
		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);
	else
		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);

	top = (__m128i *)to;
	_mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
	_mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
	_mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
	_mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
	_mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
	_mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
	_mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
	_mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
}

static void
aesni_crypt_xts(int rounds, const __m128i *data_schedule,
    const __m128i *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	__m128i tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE / 8;
	for (i = 0; i < cnt; i++) {
		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE * 8;
		to += AES_XTS_BLOCKSIZE * 8;
	}
	i *= 8;
	cnt = len / AES_XTS_BLOCKSIZE;
	for (; i < cnt; i++) {
		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}

void
aesni_encrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 1);
}

void
aesni_decrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 0);
}
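aesni_crypt_xts() derives the initial tweak as E_k2(IV), and per the comment above the IV is the little-endian 64-bit block number with the upper 64 bits zero. A hypothetical caller therefore only needs to serialize the block number into the first eight IV bytes; the helper below is a sketch of that convention, not part of the driver.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Hypothetical helper: build the XTS IV for a 64-bit block number in the
 * layout aesni_crypt_xts() expects, then encrypt a run of blocks with it.
 */
static void
xts_encrypt_block_run(int rounds, const void *data_sched,
    const void *tweak_sched, uint64_t blockno, const uint8_t *pt,
    uint8_t *ct, size_t len)
{
	uint8_t iv[16];
	int i;

	memset(iv, 0, sizeof(iv));
	/* Little-endian serialization of the block number. */
	for (i = 0; i < 8; i++)
		iv[i] = (uint8_t)(blockno >> (8 * i));

	aesni_encrypt_xts(rounds, data_sched, tweak_sched, len, pt, ct, iv);
}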
int
aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
    int keylen)
{
	int decsched;

	decsched = 1;

	switch (ses->algo) {
	case CRYPTO_AES_ICM:
	case CRYPTO_AES_NIST_GCM_16:
		decsched = 0;
		/* FALLTHROUGH */
	case CRYPTO_AES_CBC:
		switch (keylen) {
		case 128:
			ses->rounds = AES128_ROUNDS;
			break;
		case 192:
			ses->rounds = AES192_ROUNDS;
			break;
		case 256:
			ses->rounds = AES256_ROUNDS;
			break;
		default:
			CRYPTDEB("invalid CBC/ICM/GCM key length");
			return (EINVAL);
		}
		break;
	case CRYPTO_AES_XTS:
		switch (keylen) {
		case 256:
			ses->rounds = AES128_ROUNDS;
			break;
		case 512:
			ses->rounds = AES256_ROUNDS;
			break;
		default:
			CRYPTDEB("invalid XTS key length");
			return (EINVAL);
		}
		break;
	default:
		return (EINVAL);
	}

	aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
	if (decsched)
		aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
		    ses->rounds);

	if (ses->algo == CRYPTO_AES_XTS)
		aesni_set_enckey(key + keylen / 16, ses->xts_schedule,
		    ses->rounds);

	return (0);
}
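For XTS the key blob carries two AES keys back to back, which is why keylen of 256 or 512 bits maps to AES-128 or AES-256 rounds and why the tweak key is expanded from key + keylen / 16: half the bits belong to the data key, and dividing by 8 converts bits to bytes. The helper below is purely illustrative arithmetic, not driver code.

#include <stddef.h>

/*
 * Illustrative only: byte offset of the tweak key inside an XTS key blob
 * of keylen_bits (256 or 512), matching the `key + keylen / 16' above.
 * 256 / 16 == 16 (AES-128-XTS), 512 / 16 == 32 (AES-256-XTS).
 */
static size_t
xts_tweak_key_offset(int keylen_bits)
{
	return ((size_t)keylen_bits / 2 / 8);	/* == keylen_bits / 16 */
}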