Page MenuHomeFreeBSD
Paste P242

GCC8 v Clang6, vectorization of basic xor loop
ActivePublic

Authored by cem on Dec 27 2018, 11:08 PM.
Tags
None
Referenced Files
F4100223: GCC8 v Clang6, vectorization of basic xor loop
Dec 27 2018, 11:11 PM
F4100220: GCC8 v Clang6, vectorization of basic xor loop
Dec 27 2018, 11:10 PM
F4100218: GCC8 v Clang6, vectorization of basic xor loop
Dec 27 2018, 11:08 PM
Subscribers
None
GCC 8.2.0 -O2 -funroll-loops:
0000000000000060 <xor_and_encrypt_uint8_t>:
60: 48 83 ec 18 sub $0x18,%rsp
64: 48 8b 02 mov (%rdx),%rax
67: 48 33 06 xor (%rsi),%rax
6a: 48 89 04 24 mov %rax,(%rsp)
6e: 48 8b 42 08 mov 0x8(%rdx),%rax
72: 48 33 46 08 xor 0x8(%rsi),%rax
76: 48 89 e6 mov %rsp,%rsi
79: 48 89 44 24 08 mov %rax,0x8(%rsp)
7e: e8 00 00 00 00 callq 83 <xor_and_encrypt_uint8_t+0x23>
83: 48 83 c4 18 add $0x18,%rsp
87: c3 retq
Clang 6.0.1 -O2 -mno-sse (-funroll-loops makes no difference, nor does -O3). Without -mno-sse, Clang unrolls to two 128-bit xmm register loads and a single xor, but we build the kernel and libc with -mno-sse for obvious reasons, and vectorization to 64-bit registers is still valuable, especially compared to the following trash code:
0000000000000070 <xor_and_encrypt_uint8_t>:
70: 55 push %rbp
71: 48 89 e5 mov %rsp,%rbp
74: 48 83 ec 10 sub $0x10,%rsp
78: 8a 02 mov (%rdx),%al
7a: 32 06 xor (%rsi),%al
7c: 88 45 f0 mov %al,-0x10(%rbp)
7f: 8a 42 01 mov 0x1(%rdx),%al
82: 32 46 01 xor 0x1(%rsi),%al
85: 88 45 f1 mov %al,-0xf(%rbp)
88: 8a 42 02 mov 0x2(%rdx),%al
8b: 32 46 02 xor 0x2(%rsi),%al
8e: 88 45 f2 mov %al,-0xe(%rbp)
91: 8a 42 03 mov 0x3(%rdx),%al
94: 32 46 03 xor 0x3(%rsi),%al
97: 88 45 f3 mov %al,-0xd(%rbp)
9a: 8a 42 04 mov 0x4(%rdx),%al
9d: 32 46 04 xor 0x4(%rsi),%al
a0: 88 45 f4 mov %al,-0xc(%rbp)
a3: 8a 42 05 mov 0x5(%rdx),%al
a6: 32 46 05 xor 0x5(%rsi),%al
a9: 88 45 f5 mov %al,-0xb(%rbp)
ac: 8a 42 06 mov 0x6(%rdx),%al
af: 32 46 06 xor 0x6(%rsi),%al
b2: 88 45 f6 mov %al,-0xa(%rbp)
b5: 8a 42 07 mov 0x7(%rdx),%al
b8: 32 46 07 xor 0x7(%rsi),%al
bb: 88 45 f7 mov %al,-0x9(%rbp)
be: 8a 42 08 mov 0x8(%rdx),%al
c1: 32 46 08 xor 0x8(%rsi),%al
c4: 88 45 f8 mov %al,-0x8(%rbp)
c7: 8a 42 09 mov 0x9(%rdx),%al
ca: 32 46 09 xor 0x9(%rsi),%al
cd: 88 45 f9 mov %al,-0x7(%rbp)
d0: 8a 42 0a mov 0xa(%rdx),%al
d3: 32 46 0a xor 0xa(%rsi),%al
d6: 88 45 fa mov %al,-0x6(%rbp)
d9: 8a 42 0b mov 0xb(%rdx),%al
dc: 32 46 0b xor 0xb(%rsi),%al
df: 88 45 fb mov %al,-0x5(%rbp)
e2: 8a 42 0c mov 0xc(%rdx),%al
e5: 32 46 0c xor 0xc(%rsi),%al
e8: 88 45 fc mov %al,-0x4(%rbp)
eb: 8a 42 0d mov 0xd(%rdx),%al
ee: 32 46 0d xor 0xd(%rsi),%al
f1: 88 45 fd mov %al,-0x3(%rbp)
f4: 8a 42 0e mov 0xe(%rdx),%al
f7: 32 46 0e xor 0xe(%rsi),%al
fa: 88 45 fe mov %al,-0x2(%rbp)
fd: 8a 42 0f mov 0xf(%rdx),%al
100: 32 46 0f xor 0xf(%rsi),%al
103: 88 45 ff mov %al,-0x1(%rbp)
106: 48 8d 75 f0 lea -0x10(%rbp),%rsi
10a: e8 00 00 00 00 callq 10f <xor_and_encrypt_uint8_t+0x9f>
10f: 48 83 c4 10 add $0x10,%rsp
113: 5d pop %rbp
114: c3 retq
/*
 * Number of bytes processed per call.  It is divided by sizeof(...) below to
 * obtain element counts, so it is a byte count, not an element count.
 */
#define CCM_CBC_BLOCK_LEN 16
/* Opaque here: this snippet only ever passes the context through by pointer. */
struct aes_cbc_mac_ctx;
/*
 * Defined elsewhere (not part of this paste).  Called once per generated
 * function with (ctx, XOR-ed temporary block, dst).
 * NOTE(review): presumably the block-encrypt step, going by the
 * "xor_and_encrypt_" prefix of its callers -- confirm against the real source.
 */
extern void Y(struct aes_cbc_mac_ctx *, void *, uint8_t *);
/*
 * X(COPY_TYPE) -- define a function named xor_and_encrypt_<COPY_TYPE>.
 *
 * The generated function:
 *   1. views src, dst and a local 16-byte temp_block as arrays of COPY_TYPE,
 *   2. stores src[i] ^ dst[i] into temp_block for
 *      CCM_CBC_BLOCK_LEN / sizeof(COPY_TYPE) elements,
 *   3. calls Y(ctx, temp_block, dst).
 *
 * Parameters of the generated function:
 *   ctx - passed to Y unchanged; never dereferenced here.
 *   src - first XOR operand; only read.
 *   dst - second XOR operand; only read by the loop, then handed to Y as its
 *         third argument.
 *
 * Constraints visible from the code:
 *   - COPY_TYPE must be a single identifier, because it is token-pasted (##)
 *     into the function name.
 *   - The loop bound uses integer division, so if sizeof(COPY_TYPE) does not
 *     evenly divide CCM_CBC_BLOCK_LEN the trailing bytes of temp_block are
 *     left unwritten.
 *   - uint8_t, uint64_t and size_t must already be declared where the macro
 *     is expanded (no #include lines are visible in this paste).
 *
 * NOTE(review): for COPY_TYPE wider than a byte, src and dst are cast from
 * uint8_t * and dereferenced as COPY_TYPE.  That assumes the callers supply
 * suitably aligned pointers and that the compiler tolerates the type punning
 * -- confirm against callers and build flags.
 *
 * This is a compiler code-generation reproducer: the exact source form
 * matters when comparing against the disassembly listings in this paste.
 */
#define X(COPY_TYPE) \
void \
xor_and_encrypt_ ## COPY_TYPE(struct aes_cbc_mac_ctx *ctx, \
const uint8_t *src, uint8_t *dst) \
{ \
const COPY_TYPE *b1; \
COPY_TYPE *b2, *b3; \
uint64_t temp_block[CCM_CBC_BLOCK_LEN/sizeof(uint64_t)]; \
\
b1 = (const COPY_TYPE*)src; \
b2 = (COPY_TYPE*)dst; \
b3 = (COPY_TYPE*)temp_block; \
\
for (size_t count = 0; count < CCM_CBC_BLOCK_LEN/sizeof(COPY_TYPE); \
count++) \
b3[count] = b1[count] ^ b2[count]; \
Y(ctx, temp_block, dst); \
}
/*
 * Instantiate the three variants being compared: xor_and_encrypt_uint64_t,
 * xor_and_encrypt_uint32_t and xor_and_encrypt_uint8_t.  They differ only in
 * the element width used by the XOR loop.  The disassembly listings in this
 * paste are for the uint8_t variant.
 */
X(uint64_t)
X(uint32_t)
X(uint8_t)

Event Timeline

cem created this object in space S1 Global.
cem created this object with edit policy "cem (Conrad Meyer)".
cem edited the content of this paste. (Show Details)