GCC8 v Clang6, vectorization of basic xor loop
ActivePublic
Actions

Authored by cem on Dec 27 2018, 11:08 PM.

Tags

None

Referenced Files

	F4100223: GCC8 v Clang6, vectorization of basic xor loop
	Dec 27 2018, 11:11 PM

	F4100220: GCC8 v Clang6, vectorization of basic xor loop
	Dec 27 2018, 11:10 PM

	F4100218: GCC8 v Clang6, vectorization of basic xor loop
	Dec 27 2018, 11:08 PM

Subscribers

None

	GCC 8.2.0 -O2 -funroll-loops:

	0000000000000060 <xor_and_encrypt_uint8_t>:
	60: 48 83 ec 18 sub $0x18,%rsp
	64: 48 8b 02 mov (%rdx),%rax
	67: 48 33 06 xor (%rsi),%rax
	6a: 48 89 04 24 mov %rax,(%rsp)
	6e: 48 8b 42 08 mov 0x8(%rdx),%rax
	72: 48 33 46 08 xor 0x8(%rsi),%rax
	76: 48 89 e6 mov %rsp,%rsi
	79: 48 89 44 24 08 mov %rax,0x8(%rsp)
	7e: e8 00 00 00 00 callq 83 <xor_and_encrypt_uint8_t+0x23>
	83: 48 83 c4 18 add $0x18,%rsp
	87: c3 retq

	Clang 6.0.1 -O2 -mno-sse (neither -funroll-loops nor -O3 makes any difference). Without -mno-sse, Clang unrolls the loop into two 128-bit xmm register loads and a single xor; however, we build the kernel and libc with -mno-sse for obvious reasons, and vectorization to 64-bit registers is still valuable, especially compared to the following trash code:

	0000000000000070 <xor_and_encrypt_uint8_t>:
	70: 55 push %rbp
	71: 48 89 e5 mov %rsp,%rbp
	74: 48 83 ec 10 sub $0x10,%rsp
	78: 8a 02 mov (%rdx),%al
	7a: 32 06 xor (%rsi),%al
	7c: 88 45 f0 mov %al,-0x10(%rbp)
	7f: 8a 42 01 mov 0x1(%rdx),%al
	82: 32 46 01 xor 0x1(%rsi),%al
	85: 88 45 f1 mov %al,-0xf(%rbp)
	88: 8a 42 02 mov 0x2(%rdx),%al
	8b: 32 46 02 xor 0x2(%rsi),%al
	8e: 88 45 f2 mov %al,-0xe(%rbp)
	91: 8a 42 03 mov 0x3(%rdx),%al
	94: 32 46 03 xor 0x3(%rsi),%al
	97: 88 45 f3 mov %al,-0xd(%rbp)
	9a: 8a 42 04 mov 0x4(%rdx),%al
	9d: 32 46 04 xor 0x4(%rsi),%al
	a0: 88 45 f4 mov %al,-0xc(%rbp)
	a3: 8a 42 05 mov 0x5(%rdx),%al
	a6: 32 46 05 xor 0x5(%rsi),%al
	a9: 88 45 f5 mov %al,-0xb(%rbp)
	ac: 8a 42 06 mov 0x6(%rdx),%al
	af: 32 46 06 xor 0x6(%rsi),%al
	b2: 88 45 f6 mov %al,-0xa(%rbp)
	b5: 8a 42 07 mov 0x7(%rdx),%al
	b8: 32 46 07 xor 0x7(%rsi),%al
	bb: 88 45 f7 mov %al,-0x9(%rbp)
	be: 8a 42 08 mov 0x8(%rdx),%al
	c1: 32 46 08 xor 0x8(%rsi),%al
	c4: 88 45 f8 mov %al,-0x8(%rbp)
	c7: 8a 42 09 mov 0x9(%rdx),%al
	ca: 32 46 09 xor 0x9(%rsi),%al
	cd: 88 45 f9 mov %al,-0x7(%rbp)
	d0: 8a 42 0a mov 0xa(%rdx),%al
	d3: 32 46 0a xor 0xa(%rsi),%al
	d6: 88 45 fa mov %al,-0x6(%rbp)
	d9: 8a 42 0b mov 0xb(%rdx),%al
	dc: 32 46 0b xor 0xb(%rsi),%al
	df: 88 45 fb mov %al,-0x5(%rbp)
	e2: 8a 42 0c mov 0xc(%rdx),%al
	e5: 32 46 0c xor 0xc(%rsi),%al
	e8: 88 45 fc mov %al,-0x4(%rbp)
	eb: 8a 42 0d mov 0xd(%rdx),%al
	ee: 32 46 0d xor 0xd(%rsi),%al
	f1: 88 45 fd mov %al,-0x3(%rbp)
	f4: 8a 42 0e mov 0xe(%rdx),%al
	f7: 32 46 0e xor 0xe(%rsi),%al
	fa: 88 45 fe mov %al,-0x2(%rbp)
	fd: 8a 42 0f mov 0xf(%rdx),%al
	100: 32 46 0f xor 0xf(%rsi),%al
	103: 88 45 ff mov %al,-0x1(%rbp)
	106: 48 8d 75 f0 lea -0x10(%rbp),%rsi
	10a: e8 00 00 00 00 callq 10f <xor_and_encrypt_uint8_t+0x9f>
	10f: 48 83 c4 10 add $0x10,%rsp
	113: 5d pop %rbp
	114: c3 retq


	#include <stddef.h>	/* size_t */
	#include <stdint.h>	/* uint8_t, uint32_t, uint64_t */

	/* Size in bytes of one block handled by the xor_and_encrypt_* functions. */
	#define CCM_CBC_BLOCK_LEN 16

	struct aes_cbc_mac_ctx;
	/*
	 * Opaque sink for the XORed block: (ctx, temporary block, dst).
	 * Only declared here; its definition is not part of this snippet.
	 */
	extern void Y(struct aes_cbc_mac_ctx *, void *, uint8_t *);

	/*
	 * X(COPY_TYPE) defines xor_and_encrypt_<COPY_TYPE>(ctx, src, dst).
	 *
	 * The generated function XORs the CCM_CBC_BLOCK_LEN-byte block at src
	 * with the CCM_CBC_BLOCK_LEN-byte block at dst, one COPY_TYPE-sized
	 * element at a time, into a temporary block on the stack, and then
	 * calls Y(ctx, temp_block, dst).  Neither src nor dst is written by
	 * the generated function itself.
	 *
	 * The only thing that varies between instantiations is the element
	 * width of the XOR loop, which is what the compiler comparison is about.
	 *
	 * NOTE(review): the casts assume src and dst are suitably aligned for
	 * COPY_TYPE and that accessing them as COPY_TYPE is acceptable under
	 * the aliasing rules in effect -- confirm against the real callers.
	 */
	#define X(COPY_TYPE) \
	void \
	xor_and_encrypt_ ## COPY_TYPE(struct aes_cbc_mac_ctx *ctx, \
	    const uint8_t *src, uint8_t *dst) \
	{ \
		const COPY_TYPE *b1; \
		COPY_TYPE *b2, *b3; \
		uint64_t temp_block[CCM_CBC_BLOCK_LEN/sizeof(uint64_t)]; \
		\
		b1 = (const COPY_TYPE*)src; \
		b2 = (COPY_TYPE*)dst; \
		b3 = (COPY_TYPE*)temp_block; \
		\
		for (size_t count = 0; count < CCM_CBC_BLOCK_LEN/sizeof(COPY_TYPE); \
		    count++) { \
			b3[count] = b1[count] ^ b2[count]; \
		} \
		Y(ctx, temp_block, dst); \
	}

	/* One instantiation per element width under comparison. */
	X(uint64_t)
	X(uint32_t)
	X(uint8_t)

Event Timeline

cem created this paste. Dec 27 2018, 11:08 PM

cem created this object in space S1 Global.

cem created this object with edit policy "cem (Conrad Meyer)".

cem edited the content of this paste. (Show Details) Dec 27 2018, 11:10 PM

cem edited the content of this paste. (Show Details)

GCC8 v Clang6, vectorization of basic xor loop | Active | Public | Actions

Event Timeline

GCC8 v Clang6, vectorization of basic xor loop
ActivePublic
Actions