/*-
 * Copyright (C) 2016 Cavium Inc.
 * All rights reserved.
 *
 * Developed by Semihalf.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <machine/asm.h> | |||||
__FBSDID("$FreeBSD$"); | |||||
#include "assym.s" | |||||
/* | |||||
* void bzero(void *p, size_t size) | |||||
* | |||||
* x0 - p | |||||
* x1 - size | |||||
*/ | |||||
ENTRY(bzero)
	cbz	x1, ending

	/*
	 * x5 is the number of cache lines to zero - calculated later and
	 * it will become non-zero if the buffer is long enough to be
	 * zeroed by whole cache lines (and if "dc zva" is allowed).
	 * It must be cleared up front: for buffers smaller than 16 bytes
	 * x5 is never calculated, and the check at lead_out_end would
	 * otherwise see a stale (random) value.
	 * "normal" is used for buffers <= 16 bytes and to align the
	 * buffer to a cache line for buffers bigger than a cache line;
	 * a non-0 x5 after "normal" has completed indicates that it has
	 * been used to align the buffer to a cache line and zeroing by
	 * cache lines will now be performed, with x5 the number of cache
	 * lines to loop through.
	 */
	mov	x5, xzr

	/* No use of cache assisted zero for buffers with size <= 16 */
	cmp	x1, #0x10
	b.le	normal

	/*
	 * Load the size of the line that will be cleared by a "dc zva"
	 * call; 0 means that using the instruction is not allowed.
	 */
	ldr	x7, =dczva_line_size
	ldr	x7, [x7]
	cbz	x7, normal

	/*
	 * Buffer must be larger than a cache line for using cache
	 * zeroing (and cache line aligned, but this is checked after
	 * the jump).
	 */
	cmp	x1, x7
	b.lt	normal

	/*
	 * Calculate the number of bytes to the cache aligned address
	 * (x4) and the number of full cache lines (x5). x6 is the final
	 * address to zero.
	 */
	sub	x2, x7, #0x01		/* x2 = line size - 1 (alignment mask) */
	mov	x3, -1
	eor	x3, x3, x2		/* x3 = ~mask */
	add	x4, x0, x2
	and	x4, x4, x3		/* x4 = p rounded up to a cache line */
	subs	x4, x4, x0		/* x4 = bytes needed for alignment */
	b.eq	normal

	/* Calculate the number of cache "lines" in the buffer */
	sub	x5, x1, x4		/* bytes past the aligned address */
	rbit	x2, x7
	clz	x2, x2			/* x2 = log2(line size) */
	lsr	x5, x5, x2		/* x5 = full lines to zero */

	/*
	 * If the number of cache lines is 0, we will not be able to zero
	 * by cache lines, so go the normal way.
	 */
	cbz	x5, normal
	/* x6 is the final address to zero */
	add	x6, x0, x1

	/*
	 * We are here because x5 is non-0, so "normal" will be used to
	 * align the buffer before cache zeroing. x4 holds the number of
	 * bytes needed for alignment.
	 */
	mov	x1, x4

	/* When jumping here: x0 holds the pointer, x1 holds the size */
normal:
	/*
	 * Get the buffer offset from a 16 byte aligned address; 0 means
	 * the pointer is already aligned.
	 */
	ands	x2, x0, #0x0f
	b.eq	aligned_to_16
	/* Calculate one-byte loop runs to an 8 byte aligned address. */
	ands	x2, x2, #0x07
	mov	x3, #0x08
	sub	x2, x3, x2
	/* x2 is the number of bytes missing for alignment, x1 is size */
	cmp	x1, x2
	csel	x2, x1, x2, le		/* never zero more than "size" bytes */
	sub	x1, x1, x2

	/*
	 * The byte by byte loop zeroes at least enough bytes to align
	 * the pointer and at most "size" bytes.
	 */
align:
	strb	wzr, [x0], #0x01
	subs	x2, x2, #0x01
	b.ne	align

	/* Now the pointer is aligned to 8 bytes */
	cmp	x1, #0x10
	b.lt	lead_out
	/*
	 * Check if zeroing another 8 bytes is needed to reach a 16 byte
	 * aligned address, and do it.
	 */
	tbz	x0, #0x03, aligned_to_16
	str	xzr, [x0], #0x08
	sub	x1, x1, #0x08

	/* When jumping here: x0 is a 16 byte aligned address, x1 is size */
aligned_to_16:
	/* If size is less than 16 bytes, use lead_out to zero the rest */
	cmp	x1, #0x10
	b.lt	lead_out
	lsr	x2, x1, #0x04		/* x2 = number of 16-byte stores */
zero_by_16:
	stp	xzr, xzr, [x0], #0x10
	subs	x2, x2, #0x01
	b.ne	zero_by_16

	/*
	 * Lead out requires the address to be aligned to 8 bytes. It is
	 * used to zero buffers with sizes < 16 and whatever could not be
	 * zeroed by the zero_by_16 loop.
	 */
	ands	x1, x1, #0x0f		/* x1 = remaining tail (< 16 bytes) */
	b.eq	lead_out_end
lead_out:
	tbz	x1, #0x03, lead_out_dword
	str	xzr, [x0], #0x08
lead_out_dword:
	tbz	x1, #0x02, lead_out_word
	str	wzr, [x0], #0x04
lead_out_word:
	tbz	x1, #0x01, lead_out_byte
	strh	wzr, [x0], #0x02
lead_out_byte:
	tbz	x1, #0x00, lead_out_end
	strb	wzr, [x0], #0x01
lead_out_end:
	/*
	 * If x5 is non-zero, "normal" has been used as a lead in to
	 * align the buffer address to the cache line size.
	 */
	cbz	x5, ending

	/*
	 * Here x5 holds the number of lines to zero; x6 is the final
	 * address of the buffer. x0 is a cache line aligned pointer and
	 * x7 is the cache line size in bytes.
	 */
cache_line_zero:
	dc	zva, x0
	add	x0, x0, x7
	subs	x5, x5, #0x01
	b.ne	cache_line_zero

	/* Need to zero remaining bytes? */
	subs	x1, x6, x0
	b.ne	normal

ending:
	ret
END(bzero)