xor loop unrolling test
ActivePublic
Actions

Authored by cem on Dec 27 2018, 11:01 PM.

Tags

None

Referenced Files

	F4100214: xor loop unrolling test
	Dec 27 2018, 11:01 PM

Subscribers

None

	#include <stddef.h>
	#include <stdint.h>

	/* Size in bytes of the block that is XORed and then handed to Y(). */
	#define CCM_CBC_BLOCK_LEN 16

	struct aes_cbc_mac_ctx;
	/*
	 * Consumer of the XORed block.  Only declared here (no definition is
	 * visible in this file), so the compiler must materialize temp_block
	 * and emit a real call.
	 */
	extern void Y(struct aes_cbc_mac_ctx *, void *, uint8_t *);

	/*
	 * X(COPY_TYPE) defines xor_and_encrypt_<COPY_TYPE>(ctx, src, dst).
	 *
	 * The generated function XORs the CCM_CBC_BLOCK_LEN bytes at src with the
	 * CCM_CBC_BLOCK_LEN bytes at dst, COPY_TYPE-sized elements at a time, into
	 * a local temporary block, then calls Y(ctx, temp_block, dst).
	 *
	 * COPY_TYPE only selects the element width of the XOR loop, so that the
	 * code generated for different widths can be compared.
	 *
	 * NOTE(review): src and dst are accessed through COPY_TYPE pointers; for
	 * COPY_TYPE wider than uint8_t this presumably relies on the callers'
	 * buffers being suitably aligned -- confirm before reusing outside this
	 * codegen test.
	 */
	#define X(COPY_TYPE) \
	void \
	xor_and_encrypt_ ## COPY_TYPE(struct aes_cbc_mac_ctx *ctx, \
	    const uint8_t *src, uint8_t *dst) \
	{ \
		const COPY_TYPE *b1; \
		COPY_TYPE *b2, *b3; \
		uint64_t temp_block[CCM_CBC_BLOCK_LEN/sizeof(uint64_t)]; \
	\
		/* View all three buffers as arrays of COPY_TYPE. */ \
		b1 = (const COPY_TYPE*)src; \
		b2 = (COPY_TYPE*)dst; \
		b3 = (COPY_TYPE*)temp_block; \
	\
		for (size_t count = 0; count < CCM_CBC_BLOCK_LEN/sizeof(COPY_TYPE); \
		    count++) { \
			b3[count] = b1[count] ^ b2[count]; \
		} \
		Y(ctx, temp_block, dst); \
	}

	/* Instantiate one function per element width under comparison. */
	X(uint64_t)
	X(uint32_t)
	X(uint8_t)

	=============================================================================================

	cc -Wall -Wextra -g $OPT -c test.c

	=============================================================================================

	GCC 8.2.0:

	All three variants compile to identical code at -O2 -funroll-loops:

	0000000000000060 <xor_and_encrypt_uint8_t>:
	60: 48 83 ec 18 sub $0x18,%rsp
	64: 48 8b 02 mov (%rdx),%rax
	67: 48 8b 4a 08 mov 0x8(%rdx),%rcx
	6b: 48 33 06 xor (%rsi),%rax
	6e: 48 33 4e 08 xor 0x8(%rsi),%rcx
	72: 48 89 e6 mov %rsp,%rsi
	75: 48 89 04 24 mov %rax,(%rsp)
	79: 48 89 4c 24 08 mov %rcx,0x8(%rsp)
	7e: e8 00 00 00 00 callq 83 <xor_and_encrypt_uint8_t+0x23>
	83: 48 83 c4 18 add $0x18,%rsp
	87: c3 retq

	For some reason, GCC does not make that optimization at plain -O2 (without -funroll-loops), even though the unrolled code above is shorter (0x28 == 40 bytes). At plain -O2 the uint8_t version is:

	0000000000000060 <xor_and_encrypt_uint8_t>:
	60: 48 83 ec 18 sub $0x18,%rsp
	64: 31 c0 xor %eax,%eax
	66: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
	6d: 00 00 00
	70: 0f b6 0c 06 movzbl (%rsi,%rax,1),%ecx
	74: 32 0c 02 xor (%rdx,%rax,1),%cl
	77: 88 0c 04 mov %cl,(%rsp,%rax,1)
	7a: 48 83 c0 01 add $0x1,%rax
	7e: 48 83 f8 10 cmp $0x10,%rax
	82: 75 ec jne 70 <xor_and_encrypt_uint8_t+0x10>
	84: 48 89 e6 mov %rsp,%rsi
	87: e8 00 00 00 00 callq 8c <xor_and_encrypt_uint8_t+0x2c>
	8c: 48 83 c4 18 add $0x18,%rsp
	90: c3 retq

	(49 bytes)

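	-O2 -fpeel-loops also gets us down to 40 bytes, although the code is probably worse on CPUs with less sophisticated out-of-order (OOO) execution, because each load is immediately followed by a dependent xor (marked below):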

	0000000000000060 <xor_and_encrypt_uint8_t>:
	60: 48 83 ec 18 sub $0x18,%rsp
	64: 48 8b 02 mov (%rdx),%rax <<
	67: 48 33 06 xor (%rsi),%rax << data dependency
	6a: 48 89 04 24 mov %rax,(%rsp)
	6e: 48 8b 42 08 mov 0x8(%rdx),%rax
	72: 48 33 46 08 xor 0x8(%rsi),%rax
	76: 48 89 e6 mov %rsp,%rsi
	79: 48 89 44 24 08 mov %rax,0x8(%rsp)
	7e: e8 00 00 00 00 callq 83 <xor_and_encrypt_uint8_t+0x23>
	83: 48 83 c4 18 add $0x18,%rsp
	87: c3 retq

	=============================================================================================

	Clang 6.0.1:

	-O2 -mno-sse

	Clang unrolls all loops very naively. With uint64_t the result is adequate (similar to GCC with -fpeel-loops but without -funroll-loops), but the uint8_t version is unrolled to 165 bytes of code (16 individual mov/mov/xor sequences). The result is the same at -O3 or with -funroll-loops. It may be better with Clang 7, which is in head, but I have not yet compiled it. (And I can't seem to access pkg right now.)

Event Timeline

cem created this paste.Dec 27 2018, 11:01 PM

cem created this object in space S1 Global.

cem created this object with edit policy "cem (Conrad Meyer)".

xor loop unrolling testActivePublicActions

Event Timeline

xor loop unrolling test
ActivePublic
Actions