lib/libc/powerpc64/string/bcopy.S
Show All 28 Lines
#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#define BLOCK_SIZE_BITS 6
#define BLOCK_SIZE (1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK (BLOCK_SIZE - 1)
/* Minimum 8 byte alignment, to avoid cache-inhibited alignment faults.*/
#ifndef ALIGN_MASK
#define ALIGN_MASK 0x7
#endif
#define MULTI_PHASE_THRESHOLD 512

#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME __memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME __bcopy
Show All 16 Lines
#ifdef MEMMOVE
	std %r3, -8(%r1) /* save dst */
#else /* bcopy: swap src/dst */
	mr %r0, %r3
	mr %r3, %r4
	mr %r4, %r0
#endif
	/* First check for relative alignment, if unaligned copy one byte at a time */
	andi. %r8, %r3, ALIGN_MASK
	andi. %r7, %r4, ALIGN_MASK
	cmpd %r7, %r8
	bne .Lunaligned
	cmpldi %r5, MULTI_PHASE_THRESHOLD
	bge .Lmulti_phase
	b .Lfast_copy

.Lunaligned:
	/* forward or backward copy? */
	cmpd %r4, %r3
	blt .Lbackward_unaligned
	/* Just need to setup increment and jump to copy */
	li %r0, 1
	mtctr %r5
	b .Lsingle_1_loop

.Lbackward_unaligned:
	/* advance src and dst to last byte, set decrement and jump to copy */
	add %r3, %r3, %r5
	addi %r3, %r3, -1
	add %r4, %r4, %r5
	addi %r4, %r4, -1
	li %r0, -1
	mtctr %r5
	b .Lsingle_1_loop

.Lfast_copy:
	/* align src */
luporl: .Lavoid_vsx leads to .Lsingle_copy, that end up copying double words, that require 8 byte alignment with cache-inhibited storage. When memory is not aligned you probably want to branch to .Lsingle_1 instead (it may need some adaptations though), that copies bytes.
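In C terms, the distinction luporl draws can be sketched roughly as follows (an illustrative sketch only, not code from this patch; copy_bytes and copy_doublewords are hypothetical names). A byte-granular loop, like .Lsingle_1, works on cache-inhibited storage at any alignment, while a doubleword-granular loop, like the one the .Lsingle_copy path ends up in, takes an alignment fault unless the accesses are 8-byte aligned:

#include <stddef.h>
#include <stdint.h>

/* Byte-granular copy (comparable to .Lsingle_1): safe on cache-inhibited
 * mappings regardless of alignment. */
static void
copy_bytes(unsigned char *d, const unsigned char *s, size_t len)
{
	while (len--)
		*d++ = *s++;
}

/* Doubleword-granular copy (comparable to where .Lsingle_copy ends up):
 * on cache-inhibited storage each 8-byte access must be 8-byte aligned,
 * otherwise the CPU raises an alignment interrupt. */
static void
copy_doublewords(uint64_t *d, const uint64_t *s, size_t nbytes)
{
	for (size_t i = 0; i < nbytes / 8; i++)
		d[i] = s[i];
}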
	cmpd %r4, %r3 /* forward or backward copy? */
	blt .Lbackward_align
	.align 5
.Lalign:
	andi. %r0, %r4, 15
	beq .Lsingle_copy
	lbz %r0, 0(%r4)
Show All 20 Lines (.Lbackward_align_loop:)
	beq- .Ldone
	b .Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li %r0, 1
	li %r8, 16
	li %r9, 0
	b .Lsingle_phase
luporl: Style: add a space after /* and before */.
.Lbackward_single_copy:
luporl: When src and dst are not aligned between themselves (%r6 == 0), you don't need to run most of the code between .Lavoid_vsx and here, especially the align loop, since data will be copied byte by byte. IMHO, when detecting that src and dst are not aligned, at the beginning of bcopy, you could just branch to a new label, setup increment/decrement and ctr, advance src and dst to end (in the backward copy case) and jump to single_1_loop. This may improve performance a bit.
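The .Lunaligned path added earlier in this revision follows that suggestion. A rough C sketch of the intended control flow (illustrative only; sketch_memmove is a hypothetical name, not the actual implementation):

#include <stddef.h>
#include <stdint.h>

/* If src and dst do not share the same offset within an 8-byte block,
 * skip the alignment setup entirely and copy byte by byte, choosing the
 * copy direction first so overlapping regions are handled (memmove
 * semantics). */
static void *
sketch_memmove(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (((uintptr_t)d & 0x7) != ((uintptr_t)s & 0x7)) {
		if (s >= d) {		/* forward copy, step +1 */
			while (len--)
				*d++ = *s++;
		} else {		/* backward copy, step -1 */
			d += len;
			s += len;
			while (len--)
				*--d = *--s;
		}
		return (dst);
	}
	/* ... otherwise fall through to the aligned fast paths ... */
	return (dst);
}

The assembly expresses the same check with two andi. against ALIGN_MASK followed by cmpd, and keeps the per-iteration step (+1 or -1) in %r0 for the shared .Lsingle_1_loop.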
	/* backward copy */
	li %r0, -1
	li %r8, -16
	li %r9, -15
	/* point src and dst to last byte */
	addi %r3, %r3, -1
	addi %r4, %r4, -1
Show All 23 Lines
.Lsingle_1:
	andi. %r6, %r5, 0x0f /* number of 1-bytes */
	beq .Ldone /* 1-bytes == 0? done */
	mtctr %r6
	.align 5
.Lsingle_1_loop:
	lbz %r6, 0(%r4)
	add %r4, %r4, %r0 /* increment */
luporl: s/ever/every/
	stb %r6, 0(%r3)
	add %r3, %r3, %r0 /* increment */
	bdnz .Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld %r3, -8(%r1) /* restore dst */
#endif
Show All 147 Lines