Changeset View
Changeset View
Standalone View
Standalone View
lib/libmd/amd64/md5block.S
- This file was added.
| /*- | |||||
| * Copyright (c) 2024, 2025 Robert Clausecker <fuz@FreeBSD.org> | |||||
| * | |||||
| * SPDX-License-Identifier: BSD-2-Clause | |||||
| */ | |||||
| #include <machine/asm.h> | |||||
| /* | |||||
| * allrounds rfn0, rfn1, rfn2, rfn3 | |||||
| * | |||||
| * Apply the round keys to the four round functions: each rfnN is | |||||
| * invoked four times, and every invocation receives the index of the | |||||
| * first step it covers (0, 4, ..., 60) followed by the four 32-bit | |||||
| * keys for that step and the three steps after it. | |||||
| * | |||||
| * In this file the table is expanded with rounds0..rounds3 to generate | |||||
| * the unrolled hashing loops and with putkeys to emit the key table | |||||
| * in .rodata, so the 64 keys are written down only once. | |||||
| */ | |||||
| .macro allrounds rfn0, rfn1, rfn2, rfn3 | |||||
| \rfn0 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee | |||||
| \rfn0 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 | |||||
| \rfn0 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be | |||||
| \rfn0 12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 | |||||
| \rfn1 16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa | |||||
| \rfn1 20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 | |||||
| \rfn1 24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed | |||||
| \rfn1 28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a | |||||
| \rfn2 32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c | |||||
| \rfn2 36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 | |||||
| \rfn2 40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 | |||||
| \rfn2 44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 | |||||
| \rfn3 48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 | |||||
| \rfn3 52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 | |||||
| \rfn3 56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 | |||||
| \rfn3 60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 | |||||
| .endm | |||||
| /* | |||||
| * md5block(MD5_CTX, buf, len) | |||||
| * | |||||
| * Baseline variant using only general purpose registers. | |||||
| * | |||||
| * In: %rdi = context; the state words a, b, c, d are read from | |||||
| * offsets 0, 4, 8, 12 and written back to the same place | |||||
| * %rsi = buf, the message data | |||||
| * %rdx = len in bytes; rounded down to a multiple of 64 | |||||
| * | |||||
| * The callee-saved registers %rbx, %rbp and %r12 are pushed on entry | |||||
| * and popped before returning. The state is stored back only when at | |||||
| * least one whole block was processed. | |||||
| */ | |||||
| ENTRY(_libmd_md5block_baseline) | |||||
| /* | |||||
| * round a, b, c, d, f, k, m, s -- one step of the compression function: | |||||
| * | |||||
| * a = b + rol(a + k + m[g] + f(b, c, d), s) with g = m mod 16 | |||||
| * | |||||
| * a, b, c, d 32-bit registers holding the state words | |||||
| * f name of the macro that computes the boolean function | |||||
| * k round key, used as an immediate | |||||
| * m message word index; word g is read from 4*g(%rsi) | |||||
| * s rotate count | |||||
| * | |||||
| * Clobbers %ebp (receives the value of f) and the flags. | |||||
| */ | |||||
| .macro round a, b, c, d, f, k, m, s | |||||
| \f %ebp, \b, \c, \d // f(b, c, d) into scratch register %ebp | |||||
| add $\k, \a // a + k[i] | |||||
| add ((\m)%16*4)(%rsi), \a // a + k[i] + m[g] | |||||
| add %ebp, \a // a + k[i] + m[g] + f | |||||
| rol $\s, \a // rotate the sum left by s | |||||
| add \b, \a // and add b to obtain the new a | |||||
| .endm | |||||
| // f = b ? c : d | |||||
| // bitwise select, evaluated as ((c ^ d) & b) ^ d | |||||
| // f is the destination register; b, c, d are only read, | |||||
| // so f must be a register distinct from them | |||||
| .macro f0 f, b, c, d | |||||
| mov \c, \f | |||||
| xor \d, \f // c ^ d | |||||
| and \b, \f // (c ^ d) & b | |||||
| xor \d, \f // ((c ^ d) & b) ^ d | |||||
| .endm | |||||
| // f = d ? b : c | |||||
| // bitwise select, evaluated as ((c ^ b) & d) ^ c | |||||
| // f is the destination register; b, c, d are only read, | |||||
| // so f must be a register distinct from them | |||||
| .macro f1 f, b, c, d | |||||
| mov \c, \f | |||||
| xor \b, \f // c ^ b | |||||
| and \d, \f // (c ^ b) & d | |||||
| xor \c, \f // ((c ^ b) & d) ^ c | |||||
| .endm | |||||
| // f = b ^ c ^ d | |||||
| // f is the destination register; b, c, d are only read, | |||||
| // so f must be a register distinct from them | |||||
| .macro f2 f, b, c, d | |||||
| mov \c, \f | |||||
| xor \d, \f // c ^ d | |||||
| xor \b, \f // c ^ d ^ b | |||||
| .endm | |||||
| // f = c ^ (b | ~d) | |||||
| // f is the destination register; b, c, d are only read, | |||||
| // so f must be a register distinct from them | |||||
| .macro f3 f, b, c, d | |||||
| mov $-1, \f // all ones | |||||
| xor \d, \f // ~d | |||||
| or \b, \f // b | ~d | |||||
| xor \c, \f // c ^ (b | ~d) | |||||
| .endm | |||||
| // do 4 rounds | |||||
| // | |||||
| // f name of the boolean function macro handed to round | |||||
| // p, q step n (n = 0..3) uses message word index p*n + q | |||||
| // s0..s3 rotate counts of the four steps | |||||
| // k0..k3 round keys of the four steps | |||||
| // | |||||
| // The state lives in %eax, %ebx, %ecx, %edx (a, b, c, d). Each step | |||||
| // shifts the role of every register by one position, so after four | |||||
| // steps the registers are back in their original roles. | |||||
| .macro rounds f, p, q, s0, s1, s2, s3, k0, k1, k2, k3 | |||||
| round %eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0 | |||||
| round %edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1 | |||||
| round %ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2 | |||||
| round %ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3 | |||||
| .endm | |||||
| // do 4 rounds with f0, f1, f2, f3 | |||||
| // | |||||
| // i is the index of the first of the four steps, as supplied by | |||||
| // allrounds; k0..k3 are the keys of steps i..i+3. For step i + n the | |||||
| // message word index, reduced modulo 16 inside round, works out to | |||||
| // | |||||
| // rounds0: i + n rotate counts 7, 12, 17, 22 | |||||
| // rounds1: 5 * (i + n) + 1 rotate counts 5, 9, 14, 20 | |||||
| // rounds2: 3 * (i + n) + 5 rotate counts 4, 11, 16, 23 | |||||
| // rounds3: 7 * (i + n) rotate counts 6, 10, 15, 21 | |||||
| .macro rounds0 i, k0, k1, k2, k3 | |||||
| rounds f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3 | |||||
| .endm | |||||
| .macro rounds1 i, k0, k1, k2, k3 | |||||
| rounds f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3 | |||||
| .endm | |||||
| .macro rounds2 i, k0, k1, k2, k3 | |||||
| rounds f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3 | |||||
| .endm | |||||
| .macro rounds3 i, k0, k1, k2, k3 | |||||
| rounds f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3 | |||||
| .endm | |||||
| push %rbx // save the callee-saved registers used below | |||||
| push %rbp | |||||
| push %r12 | |||||
| and $~63, %rdx // length rounded down to whole 64 byte blocks | |||||
| lea (%rsi, %rdx, 1), %r12 // end pointer | |||||
| mov (%rdi), %eax // a | |||||
| mov 4(%rdi), %ebx // b | |||||
| mov 8(%rdi), %ecx // c | |||||
| mov 12(%rdi), %edx // d | |||||
| cmp %rsi, %r12 // any data to process? | |||||
| je .Lend | |||||
| .balign 16 | |||||
| // per block: %r8d..%r11d keep the state from the start of the block | |||||
| .Lloop: mov %eax, %r8d | |||||
| mov %ebx, %r9d | |||||
| mov %ecx, %r10d | |||||
| mov %edx, %r11d | |||||
| // all 64 steps, fully unrolled | |||||
| allrounds rounds0, rounds1, rounds2, rounds3 | |||||
| add %r8d, %eax // add the state from the start of the block | |||||
| add %r9d, %ebx | |||||
| add %r10d, %ecx | |||||
| add %r11d, %edx | |||||
| add $64, %rsi // advance to the next block | |||||
| cmp %rsi, %r12 // end pointer reached? | |||||
| jne .Lloop | |||||
| mov %eax, (%rdi) // write the updated state back | |||||
| mov %ebx, 4(%rdi) | |||||
| mov %ecx, 8(%rdi) | |||||
| mov %edx, 12(%rdi) | |||||
| .Lend: pop %r12 // restore in reverse order of the pushes | |||||
| pop %rbp | |||||
| pop %rbx | |||||
| ret | |||||
| END(_libmd_md5block_baseline) | |||||
| /* | |||||
| * An implementation leveraging the ANDN instruction | |||||
| * from BMI1 to shorten some dependency chains. | |||||
| * | |||||
| * Takes the same arguments as the baseline variant: %rdi = context, | |||||
| * %rsi = buf, %rdx = len. The macros round, rounds, rounds0, rounds2, | |||||
| * f0 and f2 defined for the baseline variant are reused; rounds1 and | |||||
| * rounds3 are redefined in terms of the round1 and round3 macros | |||||
| * below. As round1 uses %edi as a scratch register, the context | |||||
| * pointer is kept on the stack while the main loop runs. | |||||
| */ | |||||
| ENTRY(_libmd_md5block_bmi1) | |||||
| // special-cased round 1 | |||||
| // f1 = d ? b : c = (d & b) + (~d & c) | |||||
| // | |||||
| // The two terms never have a set bit in common, which is why they can | |||||
| // be combined by addition and added to a one after the other instead | |||||
| // of being merged into f first. | |||||
| // | |||||
| // a, b, c, d state registers, k round key, m message word index | |||||
| // (reduced modulo 16), s rotate count | |||||
| // clobbers %edi, %ebp and the flags | |||||
| .macro round1 a, b, c, d, k, m, s | |||||
| andn \c, \d, %edi // ~d & c | |||||
| add $\k, \a // a + k[i] | |||||
| mov \d, %ebp | |||||
| add ((\m)%16*4)(%rsi), \a // a + k[i] + m[g] | |||||
| and \b, %ebp // d & b | |||||
| add %edi, \a // a + k[i] + m[g] + (~d & c) | |||||
| add %ebp, \a // a + k[i] + m[g] + (~d & c) + (d & b) | |||||
| rol $\s, \a | |||||
| add \b, \a | |||||
| .endm | |||||
| // special-cased round 3 | |||||
| // f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d) | |||||
| // | |||||
| // The constant -1 is folded into the round key and the remaining | |||||
| // term is subtracted, so no separate complement is needed. | |||||
| // | |||||
| // a, b, c, d state registers, k round key, m message word index | |||||
| // (reduced modulo 16), s rotate count | |||||
| // clobbers %ebp and the flags | |||||
| .macro round3 a, b, c, d, k, m, s | |||||
| andn \d, \b, %ebp // ~b & d | |||||
| add $\k - 1, \a // a + k[i] - 1 | |||||
| add ((\m)%16*4)(%rsi), \a // a + k[i] - 1 + m[g] | |||||
| xor \c, %ebp // c ^ ~b & d | |||||
| sub %ebp, \a // a + k[i] + m[g] + f | |||||
| rol $\s, \a | |||||
| add \b, \a | |||||
| .endm | |||||
| // Drop the rounds1 definition of the baseline variant and replace it | |||||
| // with one built on round1. Arguments are unchanged: i is the index | |||||
| // of the first step, k0..k3 are the four keys. Step i + n uses | |||||
| // message word index 5 * (i + n) + 1, reduced modulo 16 in round1. | |||||
| .purgem rounds1 | |||||
| .macro rounds1 i, k0, k1, k2, k3 | |||||
| round1 %eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1, 5 | |||||
| round1 %edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6, 9 | |||||
| round1 %ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14 | |||||
| round1 %ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20 | |||||
| .endm | |||||
| // Drop the rounds3 definition of the baseline variant and replace it | |||||
| // with one built on round3. Arguments are unchanged: i is the index | |||||
| // of the first step, k0..k3 are the four keys. Step i + n uses | |||||
| // message word index 7 * (i + n), reduced modulo 16 in round3. | |||||
| .purgem rounds3 | |||||
| .macro rounds3 i, k0, k1, k2, k3 | |||||
| round3 %eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0, 6 | |||||
| round3 %edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10 | |||||
| round3 %ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15 | |||||
| round3 %ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21 | |||||
| .endm | |||||
| push %rbx // save the callee-saved registers used below | |||||
| push %rbp | |||||
| push %r12 | |||||
| and $~63, %rdx // length rounded down to whole 64 byte blocks | |||||
| lea (%rsi, %rdx, 1), %r12 // end pointer | |||||
| mov (%rdi), %eax // a | |||||
| mov 4(%rdi), %ebx // b | |||||
| mov 8(%rdi), %ecx // c | |||||
| mov 12(%rdi), %edx // d | |||||
| cmp %rsi, %r12 // any data to process? | |||||
| je 0f | |||||
| push %rdi // round1 uses %edi as scratch; keep the context pointer | |||||
| .balign 16 | |||||
| // per block: %r8d..%r11d keep the state from the start of the block | |||||
| 1: mov %eax, %r8d | |||||
| mov %ebx, %r9d | |||||
| mov %ecx, %r10d | |||||
| mov %edx, %r11d | |||||
| // all 64 steps, fully unrolled; rounds1 and rounds3 are the ANDN versions | |||||
| allrounds rounds0, rounds1, rounds2, rounds3 | |||||
| add %r8d, %eax // add the state from the start of the block | |||||
| add %r9d, %ebx | |||||
| add %r10d, %ecx | |||||
| add %r11d, %edx | |||||
| add $64, %rsi // advance to the next block | |||||
| cmp %rsi, %r12 // end pointer reached? | |||||
| jne 1b | |||||
| pop %rdi // context pointer again | |||||
| mov %eax, (%rdi) // write the updated state back | |||||
| mov %ebx, 4(%rdi) | |||||
| mov %ecx, 8(%rdi) | |||||
| mov %edx, 12(%rdi) | |||||
| 0: pop %r12 // restore in reverse order of the pushes | |||||
| pop %rbp | |||||
| pop %rbx | |||||
| ret | |||||
| END(_libmd_md5block_bmi1) | |||||
| #ifndef _KERNEL | |||||
| /* | |||||
| * An implementation leveraging AVX-512 for its VPTERNLOGD | |||||
| * instruction. We're using only XMM registers here, | |||||
| * avoiding costly thermal licensing. | |||||
| * | |||||
| * Takes the same arguments as the other variants: %rdi = context, | |||||
| * %rsi = buf, %rdx = len. Register use: | |||||
| * | |||||
| * %xmm0..%xmm3 state words a, b, c, d in the low dword | |||||
| * %xmm4, %xmm5 scratch (boolean function, key + message word) | |||||
| * %xmm8..%xmm11 the sixteen message words of the current block | |||||
| * %xmm12..%xmm15 state from the start of the current block | |||||
| * %rax address of the round key table | |||||
| * %rdx end pointer | |||||
| * | |||||
| * No callee-saved register is touched, hence nothing is pushed. | |||||
| */ | |||||
| ENTRY(_libmd_md5block_avx512) | |||||
| /* | |||||
| * vround a, b, c, d, f, i, m, mi, s -- one step in vector registers: | |||||
| * | |||||
| * a = b + rol(a + k[i] + m[mi] + f(b, c, d), s) | |||||
| * | |||||
| * a, b, c, d XMM registers holding the state words | |||||
| * f VPTERNLOGD immediate (truth table) of the boolean function | |||||
| * i step index; k[i] is read from 4*i(%rax) | |||||
| * m XMM register holding four message words | |||||
| * mi lane (0..3) of m holding the message word of this step | |||||
| * s rotate count | |||||
| * | |||||
| * The caller only loads and stores the low dword of the state | |||||
| * registers, so the contents of the other lanes do not matter. | |||||
| * Clobbers %xmm4 and %xmm5. | |||||
| */ | |||||
| .macro vround a, b, c, d, f, i, m, mi, s | |||||
| vmovdqa \b, %xmm4 // VPTERNLOGD overwrites its destination; work on a copy of b | |||||
| vpternlogd $\f, \d, \c, %xmm4 // f(b, c, d) | |||||
| vpaddd 4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i] | |||||
| .if \mi != 0 | |||||
| vpshufd $0x55 * \mi, %xmm5, %xmm5 // broadcast to each dword | |||||
| .endif | |||||
| vpaddd %xmm5, \a, \a // a + k[i] + m[g] | |||||
| vpaddd %xmm4, \a, \a // a + k[i] + m[g] + f | |||||
| vprold $\s, \a, \a // rotate the sum left by s | |||||
| vpaddd \b, \a, \a // and add b to obtain the new a | |||||
| .endm | |||||
| /* | |||||
| * vrounds f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3 | |||||
| * | |||||
| * Do four steps i..i+3 with the boolean function given by the | |||||
| * VPTERNLOGD immediate f. Step i + n takes its message word from | |||||
| * lane iN of register mN and rotates by sN. The state lives in | |||||
| * %xmm0..%xmm3 (a, b, c, d); each step shifts the role of every | |||||
| * register by one position, so after four steps the registers are | |||||
| * back in their original roles. | |||||
| */ | |||||
| .macro vrounds f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3 | |||||
| vround %xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0 | |||||
| vround %xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1 | |||||
| vround %xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2 | |||||
| vround %xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3 | |||||
| .endm | |||||
| /* | |||||
| * Truth tables of the four boolean functions. vround passes b as | |||||
| * the destination, c as the second and d as the third source of | |||||
| * VPTERNLOGD, so bit number 4*b + 2*c + d of the immediate holds | |||||
| * the function value. The rows below are in order of that bit | |||||
| * number; reading a column from bottom to top yields the immediate | |||||
| * in binary: f0 = 0xca, f1 = 0xe4, f2 = 0x96, f3 = 0x39. | |||||
| * | |||||
| * d c b f0 f1 f2 f3 | |||||
| * 0 0 0 0 0 0 1 | |||||
| * 1 0 0 1 0 1 0 | |||||
| * 0 1 0 0 1 1 0 | |||||
| * 1 1 0 1 0 0 1 | |||||
| * 0 0 1 0 0 1 1 | |||||
| * 1 0 1 0 1 0 1 | |||||
| * 0 1 1 1 1 0 0 | |||||
| * 1 1 1 1 1 1 0 | |||||
| */ | |||||
| // four steps with f0: message words are lanes 0..3 of register m, in order | |||||
| .macro vrounds0 i, m | |||||
| vrounds 0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22 | |||||
| .endm | |||||
| // four steps with f1; the caller names register and lane of each message word | |||||
| .macro vrounds1 i, m0, i0, m1, i1, m2, i2, m3, i3 | |||||
| vrounds 0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20 | |||||
| .endm | |||||
| // four steps with f2; the caller names register and lane of each message word | |||||
| .macro vrounds2 i, m0, i0, m1, i1, m2, i2, m3, i3 | |||||
| vrounds 0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23 | |||||
| .endm | |||||
| // four steps with f3; the caller names register and lane of each message word | |||||
| .macro vrounds3 i, m0, i0, m1, i1, m2, i2, m3, i3 | |||||
| vrounds 0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21 | |||||
| .endm | |||||
| and $~63, %rdx // length rounded down to whole 64 byte blocks | |||||
| add %rsi, %rdx // end pointer | |||||
| vmovd (%rdi), %xmm0 // a | |||||
| vmovd 4(%rdi), %xmm1 // b | |||||
| vmovd 8(%rdi), %xmm2 // c | |||||
| vmovd 12(%rdi), %xmm3 // d | |||||
| lea keys(%rip), %rax // round key table, indexed by vround | |||||
| cmp %rsi, %rdx // any data to process? | |||||
| je 0f | |||||
| .balign 16 | |||||
| 1: vmovdqu 0*4(%rsi), %xmm8 // message words | |||||
| vmovdqu 4*4(%rsi), %xmm9 | |||||
| vmovdqu 8*4(%rsi), %xmm10 | |||||
| vmovdqu 12*4(%rsi), %xmm11 | |||||
| vmovdqa %xmm0, %xmm12 // stash old state variables | |||||
| vmovdqa %xmm1, %xmm13 | |||||
| vmovdqa %xmm2, %xmm14 | |||||
| vmovdqa %xmm3, %xmm15 | |||||
| // steps 0..15: message words 0..15 in order | |||||
| vrounds0 0, %xmm8 | |||||
| vrounds0 4, %xmm9 | |||||
| vrounds0 8, %xmm10 | |||||
| vrounds0 12, %xmm11 | |||||
| // steps 16..31: message word (5 * i + 1) mod 16, as register and lane | |||||
| vrounds1 16, %xmm8, 1, %xmm9, 2, %xmm10, 3, %xmm8, 0 | |||||
| vrounds1 20, %xmm9, 1, %xmm10, 2, %xmm11, 3, %xmm9, 0 | |||||
| vrounds1 24, %xmm10, 1, %xmm11, 2, %xmm8, 3, %xmm10, 0 | |||||
| vrounds1 28, %xmm11, 1, %xmm8, 2, %xmm9, 3, %xmm11, 0 | |||||
| // steps 32..47: message word (3 * i + 5) mod 16, as register and lane | |||||
| vrounds2 32, %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2 | |||||
| vrounds2 36, %xmm8, 1, %xmm9, 0, %xmm9, 3, %xmm10, 2 | |||||
| vrounds2 40, %xmm11, 1, %xmm8, 0, %xmm8, 3, %xmm9, 2 | |||||
| vrounds2 44, %xmm10, 1, %xmm11, 0, %xmm11, 3, %xmm8, 2 | |||||
| // steps 48..63: message word (7 * i) mod 16, as register and lane | |||||
| vrounds3 48, %xmm8, 0, %xmm9, 3, %xmm11, 2, %xmm9, 1 | |||||
| vrounds3 52, %xmm11, 0, %xmm8, 3, %xmm10, 2, %xmm8, 1 | |||||
| vrounds3 56, %xmm10, 0, %xmm11, 3, %xmm9, 2, %xmm11, 1 | |||||
| vrounds3 60, %xmm9, 0, %xmm10, 3, %xmm8, 2, %xmm10, 1 | |||||
| vpaddd %xmm12, %xmm0, %xmm0 // add the state from the start of the block | |||||
| vpaddd %xmm13, %xmm1, %xmm1 | |||||
| vpaddd %xmm14, %xmm2, %xmm2 | |||||
| vpaddd %xmm15, %xmm3, %xmm3 | |||||
| add $64, %rsi // advance to the next block | |||||
| cmp %rsi, %rdx // end pointer reached? | |||||
| jne 1b | |||||
| vmovd %xmm0, (%rdi) // write the low dword of each state register back | |||||
| vmovd %xmm1, 4(%rdi) | |||||
| vmovd %xmm2, 8(%rdi) | |||||
| vmovd %xmm3, 12(%rdi) | |||||
| 0: ret | |||||
| END(_libmd_md5block_avx512) | |||||
| // round keys, for use in md5block_avx512 | |||||
| // | |||||
| // The table is generated from allrounds, so it holds the 64 keys as | |||||
| // 32-bit words in step order; vround reads key i at byte offset 4*i. | |||||
| .section .rodata | |||||
| .balign 16 | |||||
| // putkeys ignores the step index i and emits the four keys a, b, c, d | |||||
| .macro putkeys i, a, b, c, d | |||||
| .4byte \a, \b, \c, \d | |||||
| .endm | |||||
| keys: allrounds putkeys, putkeys, putkeys, putkeys | |||||
| .size keys, .-keys | |||||
| #endif /* !defined(_KERNEL) */ | |||||
| .section .note.GNU-stack,"",%progbits | |||||