Index: sys/amd64/amd64/support.S =================================================================== --- sys/amd64/amd64/support.S +++ sys/amd64/amd64/support.S @@ -106,7 +106,7 @@ * Adapted from bcopy written by: * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ -ENTRY(memmove_std) +.macro MEMMOVE erms /* erms picks the forward copy for more than 128 bytes: 1 uses rep movsb, 0 uses rep movsq followed by a byte tail */ PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx @@ -116,72 +116,97 @@ cmpq %rcx,%r8 /* overlapping && src < dst? */ jb 2f - cmpq $15,%rcx - jbe 1f - shrq $3,%rcx /* copy by 64-bit words */ - rep - movsq + cmpq $128,%rcx /* lengths up to 128 bytes are copied with the mov loops below */ + ja 100f + + shrq $3,%rcx + jz 20f + /* + * Handle 8-byte movs. The length is at most 128 here, so the qword count is at most 16 and fits in %cl. + */ +10: + movq (%rsi),%r9 + movq %r9,(%rdi) + leaq 8(%rsi),%rsi + leaq 8(%rdi),%rdi + decb %cl + jnz 10b + /* + * Handle 1-byte movs. + */ +20: movq %rdx,%rcx - andq $7,%rcx /* any bytes left? */ - jne 1f + andb $7,%cl /* remaining byte count; the loop at 30 counts in %cl only */ + jz 40f +30: + movb (%rsi),%dl /* %dl is scratch; the length in %rdx is not read again */ + movb %dl,(%rdi) + leaq 1(%rsi),%rsi + leaq 1(%rdi),%rdi + decb %cl + jnz 30b +40: POP_FRAME_POINTER ret - ALIGN_TEXT -1: + + /* + * Handle big sizes with rep. + */ +100: +.if \erms == 1 rep movsb - POP_FRAME_POINTER - ret +.else + shrq $3,%rcx + rep + movsq + /* + * The tail (less than 8 bytes). + */ + movq %rdx,%rcx + andb $7,%cl + jne 30b /* reuse the byte loop at 30 for the tail */ +.endif + POP_FRAME_POINTER + ret - /* ALIGN_TEXT */ + /* + * Copy backwards. The code is left pessimized. Both variants run rep with the direction flag set (std) whatever the length, then clear it with cld. + */ + ALIGN_TEXT 2: - addq %rcx,%rdi /* copy backwards */ + addq %rcx,%rdi addq %rcx,%rsi decq %rdi decq %rsi std - andq $7,%rcx /* any fractional bytes? */ +.if \erms == 1 + rep + movsb +.else + andq $7,%rcx je 3f rep movsb 3: - movq %rdx,%rcx /* copy remainder by 32-bit words */ + movq %rdx,%rcx shrq $3,%rcx subq $7,%rsi subq $7,%rdi rep movsq +.endif cld POP_FRAME_POINTER ret +.endm + +ENTRY(memmove_std) + MEMMOVE erms=0 END(memmove_std) ENTRY(memmove_erms) - PUSH_FRAME_POINTER - movq %rdi,%rax - movq %rdx,%rcx - - movq %rdi,%r8 - subq %rsi,%r8 - cmpq %rcx,%r8 /* overlapping && src < dst? 
*/ - jb 1f - - rep - movsb - POP_FRAME_POINTER - ret - -1: - addq %rcx,%rdi /* copy backwards */ - addq %rcx,%rsi - decq %rdi - decq %rsi - std - rep - movsb - cld - POP_FRAME_POINTER - ret + MEMMOVE erms=1 END(memmove_erms) /* @@ -190,79 +215,137 @@ * * Note: memcpy does not support overlapping copies */ -ENTRY(memcpy_std) +.macro MEMCPY erms /* forward copy only; for more than 128 bytes erms picks rep movsb (1) or rep movsq followed by a byte tail (0) */ PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx - cmpq $15,%rcx - jbe 1f - shrq $3,%rcx /* copy by 64-bit words */ - rep - movsq + + cmpq $128,%rcx /* lengths up to 128 bytes are copied with the mov loops below */ + ja 100f + + shrq $3,%rcx + jz 20f + /* + * Handle 8-byte movs. The length is at most 128 here, so the qword count is at most 16 and fits in %cl. + */ +10: + movq (%rsi),%r9 + movq %r9,(%rdi) + leaq 8(%rsi),%rsi + leaq 8(%rdi),%rdi + decb %cl + jnz 10b + /* + * Handle 1-byte movs. + */ +20: movq %rdx,%rcx - andq $7,%rcx /* any bytes left? */ - jne 1f + andb $7,%cl /* remaining byte count; the loop at 30 counts in %cl only */ + jz 40f +30: + movb (%rsi),%dl /* %dl is scratch; the length in %rdx is not read again */ + movb %dl,(%rdi) + leaq 1(%rsi),%rsi + leaq 1(%rdi),%rdi + decb %cl + jnz 30b +40: POP_FRAME_POINTER ret - ALIGN_TEXT -1: + + /* + * Handle big sizes with rep. + */ +100: +.if \erms == 1 rep movsb - POP_FRAME_POINTER - ret +.else + shrq $3,%rcx + rep + movsq + /* + * The tail (less than 8 bytes). 
+ */ + movq %rdx,%rcx + andb $7,%cl + jne 30b /* reuse the byte loop at 30 for the tail */ +.endif + POP_FRAME_POINTER + ret +.endm + +ENTRY(memcpy_std) + MEMCPY erms=0 END(memcpy_std) ENTRY(memcpy_erms) - PUSH_FRAME_POINTER - movq %rdi,%rax - movq %rdx,%rcx - rep - movsb - POP_FRAME_POINTER - ret + MEMCPY erms=1 END(memcpy_erms) /* * memset(dst, c, len) * rdi, rsi, rdx */ -ENTRY(memset_std) +.macro MEMSET erms /* for more than 128 bytes erms picks rep stosb (1) or rep stosq followed by a byte tail (0) */ PUSH_FRAME_POINTER movq %rdi,%r9 movq %rdx,%rcx +.if \erms == 1 + cmpq $128,%rcx /* erms tests the length first: its large path at 50 does not use the 64-bit pattern built below */ + ja 50f +.endif movzbq %sil,%r8 movabs $0x0101010101010101,%rax imulq %r8,%rax - cmpq $15,%rcx - jbe 1f +.if \erms == 0 + cmpq $128,%rcx /* non-erms tests after the pattern is built, since rep stosq at 50 stores %rax */ + ja 50f +.endif shrq $3,%rcx - rep - stosq + jz 20f +10: + movq %rax,(%rdi) /* %rax holds the fill byte replicated into all 8 bytes */ + leaq 8(%rdi),%rdi + decl %ecx + jnz 10b +20: movq %rdx,%rcx - andq $7,%rcx - jne 1f - movq %r9,%rax + andb $7,%cl /* remaining byte count; the loop at 30 counts in %cl only */ + jz 40f +30: + movb %al,(%rdi) + incq %rdi + decb %cl + jnz 30b +40: + movq %r9,%rax /* return the original dst saved in %r9 at entry */ POP_FRAME_POINTER - ret - ALIGN_TEXT -1: + ret +50: +.if \erms == 1 + movb %sil,%al /* the pattern in %rax was skipped on this path; rep stosb stores %al */ rep stosb +.else + shrq $3,%rcx + rep + stosq + movq %rdx,%rcx + andb $7,%cl + jne 30b /* reuse the byte loop at 30 for the tail */ +.endif movq %r9,%rax POP_FRAME_POINTER ret +.endm + +ENTRY(memset_std) + MEMSET erms=0 END(memset_std) ENTRY(memset_erms) - PUSH_FRAME_POINTER - movq %rdi,%r9 - movq %rdx,%rcx - movb %sil,%al - rep - stosb - movq %r9,%rax - POP_FRAME_POINTER - ret + MEMSET erms=1 END(memset_erms) /* fillw(pat, base, cnt) */