GCC 8.2.0 -O2 -funroll-loops:

0000000000000060 <xor_and_encrypt_uint8_t>:
  60:   48 83 ec 18             sub    $0x18,%rsp
  64:   48 8b 02                mov    (%rdx),%rax
  67:   48 33 06                xor    (%rsi),%rax
  6a:   48 89 04 24             mov    %rax,(%rsp)
  6e:   48 8b 42 08             mov    0x8(%rdx),%rax
  72:   48 33 46 08             xor    0x8(%rsi),%rax
  76:   48 89 e6                mov    %rsp,%rsi
  79:   48 89 44 24 08          mov    %rax,0x8(%rsp)
  7e:   e8 00 00 00 00          callq  83 <xor_and_encrypt_uint8_t+0x23>
  83:   48 83 c4 18             add    $0x18,%rsp
  87:   c3                      retq

Clang 6.0.1 -O2 -mno-sse (-funroll-loops makes no difference, nor does -O3). Without -mno-sse, Clang unrolls to two 128-bit xmm register loads and a single xor, but we build the kernel and libc with -mno-sse for obvious reasons, and vectorization to 64-bit registers is still valuable.

0000000000000070 <xor_and_encrypt_uint8_t>:
  70:   55                      push   %rbp
  71:   48 89 e5                mov    %rsp,%rbp
  74:   48 83 ec 10             sub    $0x10,%rsp
  78:   8a 02                   mov    (%rdx),%al
  7a:   32 06                   xor    (%rsi),%al
  7c:   88 45 f0                mov    %al,-0x10(%rbp)
  7f:   8a 42 01                mov    0x1(%rdx),%al
  82:   32 46 01                xor    0x1(%rsi),%al
  85:   88 45 f1                mov    %al,-0xf(%rbp)
  88:   8a 42 02                mov    0x2(%rdx),%al
  8b:   32 46 02                xor    0x2(%rsi),%al
  8e:   88 45 f2                mov    %al,-0xe(%rbp)
  91:   8a 42 03                mov    0x3(%rdx),%al
  94:   32 46 03                xor    0x3(%rsi),%al
  97:   88 45 f3                mov    %al,-0xd(%rbp)
  9a:   8a 42 04                mov    0x4(%rdx),%al
  9d:   32 46 04                xor    0x4(%rsi),%al
  a0:   88 45 f4                mov    %al,-0xc(%rbp)
  a3:   8a 42 05                mov    0x5(%rdx),%al
  a6:   32 46 05                xor    0x5(%rsi),%al
  a9:   88 45 f5                mov    %al,-0xb(%rbp)
  ac:   8a 42 06                mov    0x6(%rdx),%al
  af:   32 46 06                xor    0x6(%rsi),%al
  b2:   88 45 f6                mov    %al,-0xa(%rbp)
  b5:   8a 42 07                mov    0x7(%rdx),%al
  b8:   32 46 07                xor    0x7(%rsi),%al
  bb:   88 45 f7                mov    %al,-0x9(%rbp)
  be:   8a 42 08                mov    0x8(%rdx),%al
  c1:   32 46 08                xor    0x8(%rsi),%al
  c4:   88 45 f8                mov    %al,-0x8(%rbp)
  c7:   8a 42 09                mov    0x9(%rdx),%al
  ca:   32 46 09                xor    0x9(%rsi),%al
  cd:   88 45 f9                mov    %al,-0x7(%rbp)
  d0:   8a 42 0a                mov    0xa(%rdx),%al
  d3:   32 46 0a                xor    0xa(%rsi),%al
  d6:   88 45 fa                mov    %al,-0x6(%rbp)
  d9:   8a 42 0b                mov    0xb(%rdx),%al
  dc:   32 46 0b                xor    0xb(%rsi),%al
  df:   88 45 fb                mov    %al,-0x5(%rbp)
  e2:   8a 42 0c                mov    0xc(%rdx),%al
  e5:   32 46 0c                xor    0xc(%rsi),%al
  e8:   88 45 fc                mov    %al,-0x4(%rbp)
  eb:   8a 42 0d                mov    0xd(%rdx),%al
  ee:   32 46 0d                xor    0xd(%rsi),%al
  f1:   88 45 fd                mov    %al,-0x3(%rbp)
  f4:   8a 42 0e                mov    0xe(%rdx),%al
  f7:   32 46 0e                xor    0xe(%rsi),%al
  fa:   88 45 fe                mov    %al,-0x2(%rbp)
  fd:   8a 42 0f                mov    0xf(%rdx),%al
 100:   32 46 0f                xor    0xf(%rsi),%al
 103:   88 45 ff                mov    %al,-0x1(%rbp)
 106:   48 8d 75 f0             lea    -0x10(%rbp),%rsi
 10a:   e8 00 00 00 00          callq  10f <xor_and_encrypt_uint8_t+0x9f>
 10f:   48 83 c4 10             add    $0x10,%rsp
 113:   5d                      pop    %rbp
 114:   c3                      retq


/* Number of bytes XORed together by each generated xor_and_encrypt_* function. */
#define CCM_CBC_BLOCK_LEN 16

/* Only ever handled through a pointer here, so a forward declaration suffices. */
struct aes_cbc_mac_ctx;
/*
 * Opaque routine defined elsewhere.  Each generated function passes it the
 * context pointer, the XORed temporary block, and dst, in that order.
 * Presumably this is the block-encryption step (going by the
 * xor_and_encrypt_* names) -- confirm against the real implementation.
 */
extern void Y(struct aes_cbc_mac_ctx *, void *, uint8_t *);

/*
 * X(COPY_TYPE) defines a function named xor_and_encrypt_<COPY_TYPE>.
 *
 * The generated function XORs the CCM_CBC_BLOCK_LEN bytes at src with the
 * CCM_CBC_BLOCK_LEN bytes at dst, one COPY_TYPE-sized element at a time,
 * stores the result in a local temporary block, and then calls
 * Y(ctx, temp_block, dst).  The loop itself writes only to the temporary;
 * whatever happens to dst afterwards is up to Y.
 *
 *   ctx - passed through to Y untouched.
 *   src - first XOR operand; only read.
 *   dst - second XOR operand; also forwarded to Y as its last argument.
 *
 * The instantiations differ only in the element width used for the XOR.
 *
 * Both size expressions use integer division, so sizeof(COPY_TYPE) and
 * sizeof(uint64_t) must divide CCM_CBC_BLOCK_LEN evenly, otherwise the
 * trailing bytes would be left out.
 *
 * NOTE(review): for a COPY_TYPE wider than uint8_t the casts access src and
 * dst through a wider type than they are declared with, and temp_block is
 * written through COPY_TYPE rather than uint64_t.  That appears to rely on
 * the buffers being suitably aligned and on the build tolerating this kind
 * of type punning -- confirm before reusing the pattern elsewhere.
 *
 * Comments are kept out of the macro body on purpose: every body line has to
 * end in a backslash continuation.
 */
#define X(COPY_TYPE) \
void \
xor_and_encrypt_ ## COPY_TYPE(struct aes_cbc_mac_ctx *ctx, \
                const uint8_t *src, uint8_t *dst) \
{ \
        const COPY_TYPE *b1; \
        COPY_TYPE *b2, *b3; \
        uint64_t temp_block[CCM_CBC_BLOCK_LEN/sizeof(uint64_t)]; \
\
        b1 = (const COPY_TYPE*)src; \
        b2 = (COPY_TYPE*)dst; \
        b3 = (COPY_TYPE*)temp_block; \
\
        for (size_t count = 0; count < CCM_CBC_BLOCK_LEN/sizeof(COPY_TYPE); \
            count++) \
                b3[count] = b1[count] ^ b2[count]; \
        Y(ctx, temp_block, dst); \
}

/*
 * Instantiate the 64-bit, 32-bit and byte-wise variants:
 * xor_and_encrypt_uint64_t, xor_and_encrypt_uint32_t and
 * xor_and_encrypt_uint8_t.  The disassembly listings earlier in this file
 * are of the uint8_t variant.
 */
X(uint64_t)
X(uint32_t)
X(uint8_t)