lib/libc/powerpc64/string/bcopy.S
Show All 28 Lines
#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#define BLOCK_SIZE_BITS 6
#define BLOCK_SIZE (1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK (BLOCK_SIZE - 1)
/* Minimum 8 byte alignment, to avoid cache-inhibited alignment faults.*/
#ifndef ALIGN_MASK
#define ALIGN_MASK 0x7
#endif
#define MULTI_PHASE_THRESHOLD 512

#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME __memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME __bcopy
Show All 16 Lines
#ifdef MEMMOVE
	std %r3, -8(%r1) /* save dst */
#else /* bcopy: swap src/dst */
	mr %r0, %r3
	mr %r3, %r4
	mr %r4, %r0
#endif
	/* First check for relative alignment, if unaligned copy one byte at a time */
	andi. %r8, %r3, ALIGN_MASK
	andi. %r7, %r4, ALIGN_MASK
	cmpd %r7, %r8
	bne .Lunaligned
	cmpldi %r5, MULTI_PHASE_THRESHOLD
	bge .Lmulti_phase
	b .Lfast_copy

.Lunaligned:
	/* forward or backward copy? */
	cmpd %r4, %r3
	blt .Lbackward_unaligned
	/* Just need to setup increment and jump to copy */
	li %r0, 1
	mtctr %r5
	b .Lsingle_1_loop

.Lbackward_unaligned:
	/* advance src and dst to last byte, set decrement and jump to copy */
	add %r3, %r3, %r5
	addi %r3, %r3, -1
	add %r4, %r4, %r5
	addi %r4, %r4, -1
	li %r0, -1
	mtctr %r5
	b .Lsingle_1_loop

.Lfast_copy:
	/* align src */
luporl: .Lavoid_vsx leads to .Lsingle_copy, that end up copying double words, that require 8 byte alignment with cache-inhibited storage. When memory is not aligned you probably want to branch to .Lsingle_1 instead (it may need some adaptations though), that copies bytes.
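In C terms, the distinction luporl draws can be sketched roughly as follows (an illustrative sketch only, not code from this patch; copy_bytes and copy_doublewords are hypothetical names). A byte-granular loop, like .Lsingle_1, works on cache-inhibited storage at any alignment, while a doubleword-granular loop, like the one the .Lsingle_copy path ends up in, takes an alignment fault unless the accesses are 8-byte aligned:

#include <stddef.h>
#include <stdint.h>

/* Byte-granular copy (comparable to .Lsingle_1): safe on cache-inhibited
 * mappings regardless of alignment. */
static void
copy_bytes(unsigned char *d, const unsigned char *s, size_t len)
{
	while (len--)
		*d++ = *s++;
}

/* Doubleword-granular copy (comparable to where .Lsingle_copy ends up):
 * on cache-inhibited storage each 8-byte access must be 8-byte aligned,
 * otherwise the CPU raises an alignment interrupt. */
static void
copy_doublewords(uint64_t *d, const uint64_t *s, size_t nbytes)
{
	for (size_t i = 0; i < nbytes / 8; i++)
		d[i] = s[i];
}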
	cmpd %r4, %r3 /* forward or backward copy? */
	blt .Lbackward_align
	.align 5
.Lalign:
	andi. %r0, %r4, 15
	beq .Lsingle_copy
	lbz %r0, 0(%r4)
Show All 20 Lines (.Lbackward_align_loop:)
	beq- .Ldone
	b .Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li %r0, 1
	li %r8, 16
	li %r9, 0
	b .Lsingle_phase
luporl: Style: add a space after /* and before */.
.Lbackward_single_copy:
luporl: When src and dst are not aligned between themselves (%r6 == 0), you don't need to run most of the code between .Lavoid_vsx and here, especially the align loop, since data will be copied byte by byte. IMHO, when detecting that src and dst are not aligned, at the beginning of bcopy, you could just branch to a new label, setup increment/decrement and ctr, advance src and dst to end (in the backward copy case) and jump to single_1_loop. This may improve performance a bit.
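The .Lunaligned path added earlier in this revision follows that suggestion. A rough C sketch of the intended control flow (illustrative only; sketch_memmove is a hypothetical name, not the actual implementation):

#include <stddef.h>
#include <stdint.h>

/* If src and dst do not share the same offset within an 8-byte block,
 * skip the alignment setup entirely and copy byte by byte, choosing the
 * copy direction first so overlapping regions are handled (memmove
 * semantics). */
static void *
sketch_memmove(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (((uintptr_t)d & 0x7) != ((uintptr_t)s & 0x7)) {
		if (s >= d) {		/* forward copy, step +1 */
			while (len--)
				*d++ = *s++;
		} else {		/* backward copy, step -1 */
			d += len;
			s += len;
			while (len--)
				*--d = *--s;
		}
		return (dst);
	}
	/* ... otherwise fall through to the aligned fast paths ... */
	return (dst);
}

The assembly expresses the same check with two andi. against ALIGN_MASK followed by cmpd, and keeps the per-iteration step (+1 or -1) in %r0 for the shared .Lsingle_1_loop.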
	/* backward copy */
	li %r0, -1
	li %r8, -16
	li %r9, -15
	/* point src and dst to last byte */
	addi %r3, %r3, -1
	addi %r4, %r4, -1
Show All 23 Lines
.Lsingle_1:
	andi. %r6, %r5, 0x0f /* number of 1-bytes */
	beq .Ldone /* 1-bytes == 0? done */
	mtctr %r6
	.align 5
.Lsingle_1_loop:
	lbz %r6, 0(%r4)
	add %r4, %r4, %r0 /* increment */
luporl: s/ever/every/
	stb %r6, 0(%r3)
	add %r3, %r3, %r0 /* increment */
	bdnz .Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld %r3, -8(%r1) /* restore dst */
#endif
Show All 147 Lines