#define CCM_CBC_BLOCK_LEN 16 struct aes_cbc_mac_ctx; extern void Y(struct aes_cbc_mac_ctx *, void *, uint8_t *); #define X(COPY_TYPE) \ void \ xor_and_encrypt_ ## COPY_TYPE(struct aes_cbc_mac_ctx *ctx, \ const uint8_t *src, uint8_t *dst) \ { \ const COPY_TYPE *b1; \ COPY_TYPE *b2, *b3; \ uint64_t temp_block[CCM_CBC_BLOCK_LEN/sizeof(uint64_t)]; \ \ b1 = (const COPY_TYPE*)src; \ b2 = (COPY_TYPE*)dst; \ b3 = (COPY_TYPE*)temp_block; \ \ for (size_t count = 0; count < CCM_CBC_BLOCK_LEN/sizeof(COPY_TYPE); \ count++) \ b3[count] = b1[count] ^ b2[count]; \ Y(ctx, temp_block, dst); \ } X(uint64_t) X(uint32_t) X(uint8_t) ============================================================================================= cc -Wall -Wextra -g $OPT -c test.c ============================================================================================= GCC 8.2.0: All three identical at -O2 -funroll-loops: 0000000000000060 : 60: 48 83 ec 18 sub $0x18,%rsp 64: 48 8b 02 mov (%rdx),%rax 67: 48 8b 4a 08 mov 0x8(%rdx),%rcx 6b: 48 33 06 xor (%rsi),%rax 6e: 48 33 4e 08 xor 0x8(%rsi),%rcx 72: 48 89 e6 mov %rsp,%rsi 75: 48 89 04 24 mov %rax,(%rsp) 79: 48 89 4c 24 08 mov %rcx,0x8(%rsp) 7e: e8 00 00 00 00 callq 83 83: 48 83 c4 18 add $0x18,%rsp 87: c3 retq For some reason, that optimization isn't made at -O2 without -funroll, despite the code being shorter (0x28 == 40 bytes): 0000000000000060 : 60: 48 83 ec 18 sub $0x18,%rsp 64: 31 c0 xor %eax,%eax 66: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 6d: 00 00 00 70: 0f b6 0c 06 movzbl (%rsi,%rax,1),%ecx 74: 32 0c 02 xor (%rdx,%rax,1),%cl 77: 88 0c 04 mov %cl,(%rsp,%rax,1) 7a: 48 83 c0 01 add $0x1,%rax 7e: 48 83 f8 10 cmp $0x10,%rax 82: 75 ec jne 70 84: 48 89 e6 mov %rsp,%rsi 87: e8 00 00 00 00 callq 8c 8c: 48 83 c4 18 add $0x18,%rsp 90: c3 retq (49 bytes) -O2 -fpeel-loops also gets us down to 40 bytes, although the code is probably less good for less sophisticated (OOO) CPUs: 0000000000000060 : 60: 48 83 ec 18 sub $0x18,%rsp 64: 48 8b 02 mov 
(%rdx),%rax   <<
  67: 48 33 06              xor    (%rsi),%rax   << data dependency
  6a: 48 89 04 24           mov    %rax,(%rsp)
  6e: 48 8b 42 08           mov    0x8(%rdx),%rax
  72: 48 33 46 08           xor    0x8(%rsi),%rax
  76: 48 89 e6              mov    %rsp,%rsi
  79: 48 89 44 24 08        mov    %rax,0x8(%rsp)
  7e: e8 00 00 00 00        callq  83
  83: 48 83 c4 18           add    $0x18,%rsp
  87: c3                    retq

=============================================================================================

Clang 6.0.1:

At -O2 -mno-sse it unrolls all loops super naively. With uint64_t it's
adequate (similar to GCC with -fpeel-loops but without -funroll-loops),
but the uint8_t version is unrolled to 165 bytes of code (16 individual
mov/mov/xors). Same at -O3, or with -funroll-loops.

May be better with Clang 7, which is in HEAD but I have not yet compiled
it. (And I can't seem to access pkg right now.)