diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 3bc36078768b..4df4ff8f1417 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,13 +1,14 @@
 
 MDSRCS+= \
 	amd64_archlevel.c \
 	bcmp.S \
 	memcmp.S \
 	memcpy.S \
 	memmove.S \
 	memset.S \
+	stpcpy.S \
 	strcat.S \
 	strchrnul.S \
 	strcmp.S \
 	strlen.S \
-	stpcpy.S
+	strcpy.c
diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S
index 73c765556dc1..59358e3245a8 100644
--- a/lib/libc/amd64/string/stpcpy.S
+++ b/lib/libc/amd64/string/stpcpy.S
@@ -1,114 +1,237 @@
-/*
- * Adapted by Guillaume Morin from strcpy.S
- * written by J.T. Conklin
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
+ * written by J.T. Conklin and
+ * adapted by Guillaume Morin to implement stpcpy
+ * that was originally dedicated to the public domain
  */
 
 #include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
+	.weak	stpcpy
+	.set	stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+	ARCHFUNC(__stpcpy, scalar)
+	ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
 /*
  * This stpcpy implementation copies a byte at a time until the
  * source pointer is aligned to a word boundary, it then copies by
  * words until it finds a word containing a zero byte, and finally
  * copies by bytes until the end of the string is reached.
  *
  * While this may result in unaligned stores if the source and
  * destination pointers are unaligned with respect to each other,
  * it is still faster than either byte copies or the overhead of
  * an implementation suitable for machines with strict alignment
  * requirements.
  */
 
-	.globl	stpcpy,__stpcpy
-ENTRY(stpcpy)
-__stpcpy:
+ARCHENTRY(__stpcpy, scalar)
 	movabsq $0x0101010101010101,%r8
 	movabsq $0x8080808080808080,%r9
 
 	/*
 	 * Align source to a word boundary.
 	 * Consider unrolling loop?
 	 */
 .Lalign:
 	testb	$7,%sil
 	je	.Lword_aligned
 	movb	(%rsi),%dl
 	incq	%rsi
 	movb	%dl,(%rdi)
 	incq	%rdi
 	testb	%dl,%dl
 	jne	.Lalign
 	movq	%rdi,%rax
 	dec	%rax
 	ret
 
-	.p2align 4
+	ALIGN_TEXT
 .Lloop:
 	movq	%rdx,(%rdi)
 	addq	$8,%rdi
 .Lword_aligned:
 	movq	(%rsi),%rdx
 	movq	%rdx,%rcx
 	addq	$8,%rsi
 	subq	%r8,%rcx
 	testq	%r9,%rcx
 	je	.Lloop
 
 	/*
 	 * In rare cases, the above loop may exit prematurely. We must
 	 * return to the loop if none of the bytes in the word equal 0.
 	 */
 
 	movb	%dl,(%rdi)
 	testb	%dl,%dl		/* 1st byte == 0? */
 	je	.Ldone
 	incq	%rdi
 
 	shrq	$8,%rdx
 	movb	%dl,(%rdi)
 	testb	%dl,%dl		/* 2nd byte == 0? */
 	je	.Ldone
 	incq	%rdi
 
 	shrq	$8,%rdx
 	movb	%dl,(%rdi)
 	testb	%dl,%dl		/* 3rd byte == 0? */
 	je	.Ldone
 	incq	%rdi
 
 	shrq	$8,%rdx
 	movb	%dl,(%rdi)
 	testb	%dl,%dl		/* 4th byte == 0? */
 	je	.Ldone
 	incq	%rdi
 
 	shrq	$8,%rdx
 	movb	%dl,(%rdi)
 	testb	%dl,%dl		/* 5th byte == 0? */
 	je	.Ldone
 	incq	%rdi
 
 	shrq	$8,%rdx
 	movb	%dl,(%rdi)
 	testb	%dl,%dl		/* 6th byte == 0? */
 	je	.Ldone
 	incq	%rdi
 
 	shrq	$8,%rdx
 	movb	%dl,(%rdi)
 	testb	%dl,%dl		/* 7th byte == 0? */
 	je	.Ldone
 	incq	%rdi
 
 	shrq	$8,%rdx
 	movb	%dl,(%rdi)
 	incq	%rdi
 	testb	%dl,%dl		/* 8th byte == 0? */
 	jne	.Lword_aligned
 	decq	%rdi
 
 .Ldone:
 	movq	%rdi,%rax
 	ret
-END(stpcpy)
-
+ARCHEND(__stpcpy, scalar)
+
+ARCHENTRY(__stpcpy, baseline)
+	mov	%esi, %ecx		# low bits of source address (masked below)
+	mov	%rdi, %rdx		# keep original destination pointer
+	sub	%rsi, %rdi		# express destination as distance to source
+	and	$~0xf, %rsi		# align source to 16 bytes
+	movdqa	(%rsi), %xmm0		# head of string with junk before
+	pxor	%xmm1, %xmm1		# all-zero vector to compare against
+	and	$0xf, %ecx		# misalignment in bytes
+	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
+	pmovmskb %xmm0, %eax
+	shr	%cl, %eax		# clear out matches in junk bytes
+	bsf	%eax, %eax		# find match if any
+	jnz	.Lrunt
+
+	/* first normal iteration: write head back if it succeeds */
+	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
+	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax		# find match if any
+	jnz	.Lshorty
+
+	movdqu	%xmm2, (%rdx)		# store beginning of string
+
+	/* main loop, unrolled twice */
+	ALIGN_TEXT
+0:	movdqa	32(%rsi), %xmm2		# load current iteration
+	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+	pxor	%xmm1, %xmm1
+	add	$32, %rsi
+	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	1f
+
+	movdqa	16(%rsi), %xmm0		# load current iteration
+	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jz	0b
+
+	/* end of string after main loop has iterated */
+	add	$16, %rsi		# advance rsi to second unrolled half
+1:	tzcnt	%eax, %eax		# find location of match
+					# (behaves as bsf on pre-x86-64-v3 CPUs)
+	add	%rsi, %rax		# point to NUL byte
+	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
+	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
+	add	%rdi, %rax		# point to destination's NUL byte
+	ret
+
+	/* NUL encountered in second iteration */
+.Lshorty:
+	tzcnt	%eax, %eax
+	add	$16, %eax		# account for length of first iteration
+	sub	%ecx, %eax		# but not the parts before the string
+
+	/* NUL encountered in first iteration */
+.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
+	add	%rcx, %rsi		# point to beginning of string
+	add	%rdx, %rax		# point to NUL byte
+
+	/* transfer 16--32 bytes */
+.L1632:	cmp	$16, %edi
+	jb	.L0815
+
+	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+	movdqu	%xmm2, (%rdx)		# store first 16 bytes
+	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
+	ret
+
+	/* transfer 8--15 bytes */
+.L0815:	cmp	$8, %edi
+	jb	.L0407
+
+	mov	(%rsi), %rcx		# load first 8 bytes
+	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
+	mov	%rcx, (%rdx)		# store to dst
+	mov	%rdi, -7(%rax)		# ditto
+	ret
+
+	/* transfer 4--7 bytes */
+.L0407:	cmp	$4, %edi
+	jb	.L0203
+
+	mov	(%rsi), %ecx		# load first 4 bytes
+	mov	-4(%rsi, %rdi, 1), %edi	# load last 4 bytes
+	mov	%ecx, (%rdx)		# store to dst
+	mov	%edi, -3(%rax)		# ditto
+	ret
+
+	/* transfer 2--3 bytes */
+.L0203:	cmp	$2, %edi
+	jb	.L0101
+
+	movzwl	(%rsi), %ecx		# load first two bytes
+	mov	%cx, (%rdx)		# store first two bytes
+
+	/* transfer 0 bytes (last byte is always NUL) */
+.L0101:	movb	$0, (%rax)		# store terminating NUL byte
+	ret
+ARCHEND(__stpcpy, baseline)
+
 	.section .note.GNU-stack,"",%progbits