Changeset View
Changeset View
Standalone View
Standalone View
head/contrib/bearssl/src/hash/ghash_pwr8.c
Property | Old Value | New Value |
---|---|---|
svn:eol-style | null | native \ No newline at end of property |
svn:keywords | null | FreeBSD=%H \ No newline at end of property |
svn:mime-type | null | text/plain \ No newline at end of property |
/* | |||||
* Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> | |||||
* | |||||
* Permission is hereby granted, free of charge, to any person obtaining | |||||
* a copy of this software and associated documentation files (the | |||||
* "Software"), to deal in the Software without restriction, including | |||||
* without limitation the rights to use, copy, modify, merge, publish, | |||||
* distribute, sublicense, and/or sell copies of the Software, and to | |||||
* permit persons to whom the Software is furnished to do so, subject to | |||||
* the following conditions: | |||||
* | |||||
* The above copyright notice and this permission notice shall be | |||||
* included in all copies or substantial portions of the Software. | |||||
* | |||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |||||
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |||||
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
* SOFTWARE. | |||||
*/ | |||||
#define BR_POWER_ASM_MACROS 1 | |||||
#include "inner.h" | |||||
/* | |||||
* This is the GHASH implementation that leverages the POWER8 opcodes. | |||||
*/ | |||||
#if BR_POWER8 | |||||
/* | |||||
* Some symbolic names for registers. | |||||
* HB0 = 16 bytes of value 0 | |||||
* HB1 = 16 bytes of value 1 | |||||
* HB2 = 16 bytes of value 2 | |||||
* HB6 = 16 bytes of value 6 | |||||
* HB7 = 16 bytes of value 7 | |||||
* TT0, TT1 and TT2 are temporaries | |||||
* | |||||
* BSW holds the pattern for byteswapping 32-bit words; this is set only | |||||
* on little-endian systems. XBSW is the same register with the +32 offset | |||||
* for access with the VSX opcodes. | |||||
*/ | |||||
#define HB0 0 | |||||
#define HB1 1 | |||||
#define HB2 2 | |||||
#define HB6 3 | |||||
#define HB7 4 | |||||
#define TT0 5 | |||||
#define TT1 6 | |||||
#define TT2 7 | |||||
#define BSW 8 | |||||
#define XBSW 40 | |||||
/* | |||||
* Macro to initialise the constants. | |||||
*/ | |||||
#define INIT \ | |||||
vxor(HB0, HB0, HB0) \ | |||||
vspltisb(HB1, 1) \ | |||||
vspltisb(HB2, 2) \ | |||||
vspltisb(HB6, 6) \ | |||||
vspltisb(HB7, 7) \ | |||||
INIT_BSW | |||||
/* | |||||
* Fix endianness of a value after reading it or before writing it, if | |||||
* necessary. | |||||
*/ | |||||
#if BR_POWER8_LE | |||||
#define INIT_BSW lxvw4x(XBSW, 0, %[idx2be]) | |||||
#define FIX_ENDIAN(xx) vperm(xx, xx, xx, BSW) | |||||
#else | |||||
#define INIT_BSW | |||||
#define FIX_ENDIAN(xx) | |||||
#endif | |||||
/* | |||||
* Left-shift x0:x1 by one bit to the left. This is a corrective action | |||||
* needed because GHASH is defined in full little-endian specification, | |||||
* while the opcodes use full big-endian convention, so the 255-bit product | |||||
* ends up one bit to the right. | |||||
*/ | |||||
#define SL_256(x0, x1) \ | |||||
vsldoi(TT0, HB0, x1, 1) \ | |||||
vsl(x0, x0, HB1) \ | |||||
vsr(TT0, TT0, HB7) \ | |||||
vsl(x1, x1, HB1) \ | |||||
vxor(x0, x0, TT0) | |||||
/* | |||||
* Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as | |||||
* x0 or x1, or a different register). x0 and x1 are modified. | |||||
*/ | |||||
#define REDUCE_F128(xd, x0, x1) \ | |||||
vxor(x0, x0, x1) \ | |||||
vsr(TT0, x1, HB1) \ | |||||
vsr(TT1, x1, HB2) \ | |||||
vsr(TT2, x1, HB7) \ | |||||
vxor(x0, x0, TT0) \ | |||||
vxor(TT1, TT1, TT2) \ | |||||
vxor(x0, x0, TT1) \ | |||||
vsldoi(x1, x1, HB0, 15) \ | |||||
vsl(TT1, x1, HB6) \ | |||||
vsl(TT2, x1, HB1) \ | |||||
vxor(x1, TT1, TT2) \ | |||||
vsr(TT0, x1, HB1) \ | |||||
vsr(TT1, x1, HB2) \ | |||||
vsr(TT2, x1, HB7) \ | |||||
vxor(x0, x0, x1) \ | |||||
vxor(x0, x0, TT0) \ | |||||
vxor(TT1, TT1, TT2) \ | |||||
vxor(xd, x0, TT1) | |||||
/* see bearssl_hash.h */ | |||||
void | |||||
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len) | |||||
{ | |||||
const unsigned char *buf1, *buf2; | |||||
size_t num4, num1; | |||||
unsigned char tmp[64]; | |||||
long cc0, cc1, cc2, cc3; | |||||
#if BR_POWER8_LE | |||||
static const uint32_t idx2be[] = { | |||||
0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C | |||||
}; | |||||
#endif | |||||
buf1 = data; | |||||
/* | |||||
* Assembly code requires data into two chunks; first chunk | |||||
* must contain a number of blocks which is a multiple of 4. | |||||
* Since the processing for the first chunk is faster, we want | |||||
* to make it as big as possible. | |||||
* | |||||
* For the remainder, there are two possibilities: | |||||
* -- if the remainder size is a multiple of 16, then use it | |||||
* in place; | |||||
* -- otherwise, copy it to the tmp[] array and pad it with | |||||
* zeros. | |||||
*/ | |||||
num4 = len >> 6; | |||||
buf2 = buf1 + (num4 << 6); | |||||
len &= 63; | |||||
num1 = (len + 15) >> 4; | |||||
if ((len & 15) != 0) { | |||||
memcpy(tmp, buf2, len); | |||||
memset(tmp + len, 0, (num1 << 4) - len); | |||||
buf2 = tmp; | |||||
} | |||||
cc0 = 0; | |||||
cc1 = 16; | |||||
cc2 = 32; | |||||
cc3 = 48; | |||||
asm volatile ( | |||||
INIT | |||||
/* | |||||
* Load current h (denoted hereafter h1) in v9. | |||||
*/ | |||||
lxvw4x(41, 0, %[h]) | |||||
FIX_ENDIAN(9) | |||||
/* | |||||
* Load current y into v28. | |||||
*/ | |||||
lxvw4x(60, 0, %[y]) | |||||
FIX_ENDIAN(28) | |||||
/* | |||||
* Split h1 into three registers: | |||||
* v17 = h1_1:h1_0 | |||||
* v18 = 0:h1_0 | |||||
* v19 = h1_1:0 | |||||
*/ | |||||
xxpermdi(49, 41, 41, 2) | |||||
vsldoi(18, HB0, 9, 8) | |||||
vsldoi(19, 9, HB0, 8) | |||||
/* | |||||
* If num4 is 0, skip directly to the second chunk. | |||||
*/ | |||||
cmpldi(%[num4], 0) | |||||
beq(chunk1) | |||||
/* | |||||
* Compute h2 = h*h in v10. | |||||
*/ | |||||
vpmsumd(10, 18, 18) | |||||
vpmsumd(11, 19, 19) | |||||
SL_256(10, 11) | |||||
REDUCE_F128(10, 10, 11) | |||||
/* | |||||
* Compute h3 = h*h*h in v11. | |||||
* We first split h2 into: | |||||
* v10 = h2_0:h2_1 | |||||
* v11 = 0:h2_0 | |||||
* v12 = h2_1:0 | |||||
* Then we do the product with h1, and reduce into v11. | |||||
*/ | |||||
vsldoi(11, HB0, 10, 8) | |||||
vsldoi(12, 10, HB0, 8) | |||||
vpmsumd(13, 10, 17) | |||||
vpmsumd(11, 11, 18) | |||||
vpmsumd(12, 12, 19) | |||||
vsldoi(14, HB0, 13, 8) | |||||
vsldoi(15, 13, HB0, 8) | |||||
vxor(11, 11, 14) | |||||
vxor(12, 12, 15) | |||||
SL_256(11, 12) | |||||
REDUCE_F128(11, 11, 12) | |||||
/* | |||||
* Compute h4 = h*h*h*h in v12. This is done by squaring h2. | |||||
*/ | |||||
vsldoi(12, HB0, 10, 8) | |||||
vsldoi(13, 10, HB0, 8) | |||||
vpmsumd(12, 12, 12) | |||||
vpmsumd(13, 13, 13) | |||||
SL_256(12, 13) | |||||
REDUCE_F128(12, 12, 13) | |||||
/* | |||||
* Repack h1, h2, h3 and h4: | |||||
* v13 = h4_0:h3_0 | |||||
* v14 = h4_1:h3_1 | |||||
* v15 = h2_0:h1_0 | |||||
* v16 = h2_1:h1_1 | |||||
*/ | |||||
xxpermdi(45, 44, 43, 0) | |||||
xxpermdi(46, 44, 43, 3) | |||||
xxpermdi(47, 42, 41, 0) | |||||
xxpermdi(48, 42, 41, 3) | |||||
/* | |||||
* Loop for each group of four blocks. | |||||
*/ | |||||
mtctr(%[num4]) | |||||
label(loop4) | |||||
/* | |||||
* Read the four next blocks. | |||||
* v20 = y + a0 = b0 | |||||
* v21 = a1 = b1 | |||||
* v22 = a2 = b2 | |||||
* v23 = a3 = b3 | |||||
*/ | |||||
lxvw4x(52, %[cc0], %[buf1]) | |||||
lxvw4x(53, %[cc1], %[buf1]) | |||||
lxvw4x(54, %[cc2], %[buf1]) | |||||
lxvw4x(55, %[cc3], %[buf1]) | |||||
FIX_ENDIAN(20) | |||||
FIX_ENDIAN(21) | |||||
FIX_ENDIAN(22) | |||||
FIX_ENDIAN(23) | |||||
addi(%[buf1], %[buf1], 64) | |||||
vxor(20, 20, 28) | |||||
/* | |||||
* Repack the blocks into v9, v10, v11 and v12. | |||||
* v9 = b0_0:b1_0 | |||||
* v10 = b0_1:b1_1 | |||||
* v11 = b2_0:b3_0 | |||||
* v12 = b2_1:b3_1 | |||||
*/ | |||||
xxpermdi(41, 52, 53, 0) | |||||
xxpermdi(42, 52, 53, 3) | |||||
xxpermdi(43, 54, 55, 0) | |||||
xxpermdi(44, 54, 55, 3) | |||||
/* | |||||
* Compute the products. | |||||
* v20 = b0_0*h4_0 + b1_0*h3_0 | |||||
* v21 = b0_1*h4_0 + b1_1*h3_0 | |||||
* v22 = b0_0*h4_1 + b1_0*h3_1 | |||||
* v23 = b0_1*h4_1 + b1_1*h3_1 | |||||
* v24 = b2_0*h2_0 + b3_0*h1_0 | |||||
* v25 = b2_1*h2_0 + b3_1*h1_0 | |||||
* v26 = b2_0*h2_1 + b3_0*h1_1 | |||||
* v27 = b2_1*h2_1 + b3_1*h1_1 | |||||
*/ | |||||
vpmsumd(20, 13, 9) | |||||
vpmsumd(21, 13, 10) | |||||
vpmsumd(22, 14, 9) | |||||
vpmsumd(23, 14, 10) | |||||
vpmsumd(24, 15, 11) | |||||
vpmsumd(25, 15, 12) | |||||
vpmsumd(26, 16, 11) | |||||
vpmsumd(27, 16, 12) | |||||
/* | |||||
* Sum products into a single 256-bit result in v11:v12. | |||||
*/ | |||||
vxor(11, 20, 24) | |||||
vxor(12, 23, 27) | |||||
vxor( 9, 21, 22) | |||||
vxor(10, 25, 26) | |||||
vxor(20, 9, 10) | |||||
vsldoi( 9, HB0, 20, 8) | |||||
vsldoi(10, 20, HB0, 8) | |||||
vxor(11, 11, 9) | |||||
vxor(12, 12, 10) | |||||
/* | |||||
* Fix and reduce in GF(2^128); this is the new y (in v28). | |||||
*/ | |||||
SL_256(11, 12) | |||||
REDUCE_F128(28, 11, 12) | |||||
/* | |||||
* Loop for next group of four blocks. | |||||
*/ | |||||
bdnz(loop4) | |||||
/* | |||||
* Process second chunk, one block at a time. | |||||
*/ | |||||
label(chunk1) | |||||
cmpldi(%[num1], 0) | |||||
beq(done) | |||||
mtctr(%[num1]) | |||||
label(loop1) | |||||
/* | |||||
* Load next data block and XOR it into y. | |||||
*/ | |||||
lxvw4x(41, 0, %[buf2]) | |||||
#if BR_POWER8_LE | |||||
FIX_ENDIAN(9) | |||||
#endif | |||||
addi(%[buf2], %[buf2], 16) | |||||
vxor(9, 28, 9) | |||||
/* | |||||
* Split y into doublewords: | |||||
* v9 = y_0:y_1 | |||||
* v10 = 0:y_0 | |||||
* v11 = y_1:0 | |||||
*/ | |||||
vsldoi(10, HB0, 9, 8) | |||||
vsldoi(11, 9, HB0, 8) | |||||
/* | |||||
* Compute products with h: | |||||
* v12 = y_0 * h_0 | |||||
* v13 = y_1 * h_1 | |||||
* v14 = y_1 * h_0 + y_0 * h_1 | |||||
*/ | |||||
vpmsumd(14, 9, 17) | |||||
vpmsumd(12, 10, 18) | |||||
vpmsumd(13, 11, 19) | |||||
/* | |||||
* Propagate v14 into v12:v13 to finalise product. | |||||
*/ | |||||
vsldoi(10, HB0, 14, 8) | |||||
vsldoi(11, 14, HB0, 8) | |||||
vxor(12, 12, 10) | |||||
vxor(13, 13, 11) | |||||
/* | |||||
* Fix result and reduce into v28 (next value for y). | |||||
*/ | |||||
SL_256(12, 13) | |||||
REDUCE_F128(28, 12, 13) | |||||
bdnz(loop1) | |||||
label(done) | |||||
/* | |||||
* Write back the new y. | |||||
*/ | |||||
FIX_ENDIAN(28) | |||||
stxvw4x(60, 0, %[y]) | |||||
: [buf1] "+b" (buf1), [buf2] "+b" (buf2) | |||||
: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1), | |||||
[cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3) | |||||
#if BR_POWER8_LE | |||||
, [idx2be] "b" (idx2be) | |||||
#endif | |||||
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", | |||||
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", | |||||
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", | |||||
"ctr", "memory" | |||||
); | |||||
} | |||||
/* see bearssl_hash.h */ | |||||
br_ghash | |||||
br_ghash_pwr8_get(void) | |||||
{ | |||||
return &br_ghash_pwr8; | |||||
} | |||||
#else | |||||
/* see bearssl_hash.h */ | |||||
br_ghash | |||||
br_ghash_pwr8_get(void) | |||||
{ | |||||
return 0; | |||||
} | |||||
#endif |