Index: sys/amd64/amd64/support.S =================================================================== --- sys/amd64/amd64/support.S +++ sys/amd64/amd64/support.S @@ -697,6 +697,72 @@ ret END(fillw) +/* + * strlen(string) + * %rdi + * + * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick. + * + * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added + * with leaq. + * + * For a description see either: + * - "Hacker's Delight" by Henry S. Warren, Jr. for explanation. + * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms" + * by Agner Fog + * + * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. + */ +ENTRY(strlen) + PUSH_FRAME_POINTER + movabsq $0xfefefefefefefeff,%r8 + movabsq $0x8080808080808080,%r9 + + movq %rdi,%r10 + movq %rdi,%rcx + testb $7,%dil + jz 2f + + /* + * Handle misaligned reads: align to 8 and fill + * the spurious bytes. + */ + andq $~7,%rdi + movq (%rdi),%r11 + shlq $3,%rcx + movq $-1,%rdx + shlq %cl,%rdx + notq %rdx + orq %rdx,%r11 + + leaq (%r11,%r8),%rcx + notq %r11 + andq %r11,%rcx + andq %r9,%rcx + jnz 3f + + /* + * Main loop. + */ + ALIGN_TEXT +1: + leaq 8(%rdi),%rdi +2: + movq (%rdi),%r11 + leaq (%r11,%r8),%rcx + notq %r11 + andq %r11,%rcx + andq %r9,%rcx + jz 1b +3: + bsfq %rcx,%rcx + shrq $3,%rcx + leaq (%rcx,%rdi),%rax + subq %r10,%rax + POP_FRAME_POINTER + ret +END(strlen) + /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ Index: sys/conf/files.amd64 =================================================================== --- sys/conf/files.amd64 +++ sys/conf/files.amd64 @@ -390,7 +390,6 @@ isa/vga_isa.c optional vga kern/imgact_aout.c optional compat_aout kern/link_elf_obj.c standard -libkern/strlen.c standard # # IA32 binary support #