#define CCM_CBC_BLOCK_LEN 16 struct aes_cbc_mac_ctx; extern void Y(struct aes_cbc_mac_ctx *, void *, uint8_t *); #define X(COPY_TYPE) \ void \ xor_and_encrypt_ ## COPY_TYPE(struct aes_cbc_mac_ctx *ctx, \ const uint8_t *src, uint8_t *dst) \ { \ const COPY_TYPE *b1; \ COPY_TYPE *b2, *b3; \ uint64_t temp_block[CCM_CBC_BLOCK_LEN/sizeof(uint64_t)]; \ \ b1 = (const COPY_TYPE*)src; \ b2 = (COPY_TYPE*)dst; \ b3 = (COPY_TYPE*)temp_block; \ \ for (size_t count = 0; count < CCM_CBC_BLOCK_LEN/sizeof(COPY_TYPE); \ count++) \ b3[count] = b1[count] ^ b2[count]; \ Y(ctx, temp_block, dst); \ } X(uint64_t) X(uint32_t) X(uint8_t) ============================================================================================= cc -Wall -Wextra -g $OPT -c test.c ============================================================================================= GCC 8.2.0: All three identical at -O2 -funroll-loops: 0000000000000060 : 60: 48 83 ec 18 sub $0x18,%rsp 64: 48 8b 02 mov (%rdx),%rax 67: 48 8b 4a 08 mov 0x8(%rdx),%rcx 6b: 48 33 06 xor (%rsi),%rax 6e: 48 33 4e 08 xor 0x8(%rsi),%rcx 72: 48 89 e6 mov %rsp,%rsi 75: 48 89 04 24 mov %rax,(%rsp) 79: 48 89 4c 24 08 mov %rcx,0x8(%rsp) 7e: e8 00 00 00 00 callq 83 83: 48 83 c4 18 add $0x18,%rsp 87: c3 retq For some reason, that optimization isn't made at -O2 without -funroll, despite the code being shorter (0x28 == 40 bytes): 0000000000000060 : 60: 48 83 ec 18 sub $0x18,%rsp 64: 31 c0 xor %eax,%eax 66: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 6d: 00 00 00 70: 0f b6 0c 06 movzbl (%rsi,%rax,1),%ecx 74: 32 0c 02 xor (%rdx,%rax,1),%cl 77: 88 0c 04 mov %cl,(%rsp,%rax,1) 7a: 48 83 c0 01 add $0x1,%rax 7e: 48 83 f8 10 cmp $0x10,%rax 82: 75 ec jne 70 84: 48 89 e6 mov %rsp,%rsi 87: e8 00 00 00 00 callq 8c 8c: 48 83 c4 18 add $0x18,%rsp 90: c3 retq (49 bytes) -O2 -fpeel-loops also gets us down to 40 bytes, although the code is probably less good for less sophisticated (OOO) CPUs: 0000000000000060 : 60: 48 83 ec 18 sub $0x18,%rsp 64: 48 8b 02 mov 
(%rdx),%rax   <<
  67: 48 33 06              xor    (%rsi),%rax   << data dependency
  6a: 48 89 04 24           mov    %rax,(%rsp)
  6e: 48 8b 42 08           mov    0x8(%rdx),%rax
  72: 48 33 46 08           xor    0x8(%rsi),%rax
  76: 48 89 e6              mov    %rsp,%rsi
  79: 48 89 44 24 08        mov    %rax,0x8(%rsp)
  7e: e8 00 00 00 00        callq  83
  83: 48 83 c4 18           add    $0x18,%rsp
  87: c3                    retq

=============================================================================================

Clang 6.0.1:

At -O2 -mno-sse it unrolls all loops super naively. With uint64_t it's
adequate (similar to GCC with -fpeel-loops but without -funroll-loops),
but the uint8_t version is unrolled to 165 bytes of code (16 individual
mov/mov/xors). Same at -O3, or with -funroll-loops.

May be better with Clang 7, which is in HEAD but I have not yet compiled
it. (And I can't seem to access pkg right now.)