/*-
 * Copyright (C) 2016 Cavium Inc.
 * All rights reserved.
 *
 * Developed by Semihalf.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <machine/asm.h> | |||||
__FBSDID("$FreeBSD$"); | |||||
#include "assym.s" | |||||
/* | |||||
* void bzero(void *p, size_t size) | |||||
* | |||||
* x0 - p | |||||
* x1 - size | |||||
*/ | |||||
ENTRY(bzero)
	cbz	x1, ending

	/*
	 * x5 is the number of cache lines to zero - calculated later and
	 * it will become non-zero if the buffer is long enough to be
	 * zeroed by whole cache lines (and if "dc zva" is allowed).
	 * It must be cleared up front: for buffers smaller than 16 bytes
	 * x5 is never calculated, and the check at lead_out_end would
	 * otherwise see a stale (random) value.
	 * "normal" is used for buffers <= 16 bytes and to align the
	 * buffer to a cache line for buffers bigger than a cache line;
	 * a non-0 x5 after "normal" has completed indicates that it has
	 * been used to align the buffer to a cache line and zeroing by
	 * cache lines will now be performed, with x5 the number of cache
	 * lines to loop through.
	 */
	mov	x5, xzr

	/* No use of cache assisted zero for buffers with size <= 16 */
	cmp	x1, #0x10
	b.le	normal

	/*
	 * Load the size of the line that will be cleared by a "dc zva"
	 * call; 0 means that using the instruction is not allowed.
	 */
	ldr	x7, =dczva_line_size
	ldr	x7, [x7]
	cbz	x7, normal

	/*
	 * Buffer must be larger than a cache line for using cache
	 * zeroing (and cache line aligned, but this is checked after
	 * the jump).
	 */
	cmp	x1, x7
	b.lt	normal

	/*
	 * Calculate the number of bytes to the cache aligned address
	 * (x4) and the number of full cache lines (x5). x6 is the final
	 * address to zero.
	 */
	sub	x2, x7, #0x01		/* x2 = line size - 1 (alignment mask) */
	mov	x3, -1
	eor	x3, x3, x2		/* x3 = ~mask */
	add	x4, x0, x2
	and	x4, x4, x3		/* x4 = p rounded up to a cache line */
	subs	x4, x4, x0		/* x4 = bytes needed for alignment */
	b.eq	normal

	/* Calculate the number of cache "lines" in the buffer */
	sub	x5, x1, x4		/* bytes past the aligned address */
	rbit	x2, x7
	clz	x2, x2			/* x2 = log2(line size) */
	lsr	x5, x5, x2		/* x5 = full lines to zero */

	/*
	 * If the number of cache lines is 0, we will not be able to zero
	 * by cache lines, so go the normal way.
	 */
	cbz	x5, normal
	/* x6 is the final address to zero */
	add	x6, x0, x1

	/*
	 * We are here because x5 is non-0, so "normal" will be used to
	 * align the buffer before cache zeroing. x4 holds the number of
	 * bytes needed for alignment.
	 */
	mov	x1, x4

	/* When jumping here: x0 holds the pointer, x1 holds the size */
normal:
	/*
	 * Get the buffer offset from a 16 byte aligned address; 0 means
	 * the pointer is already aligned.
	 */
	ands	x2, x0, #0x0f
	b.eq	aligned_to_16
	/* Calculate one-byte loop runs to an 8 byte aligned address. */
	ands	x2, x2, #0x07
	mov	x3, #0x08
	sub	x2, x3, x2
	/* x2 is the number of bytes missing for alignment, x1 is size */
	cmp	x1, x2
	csel	x2, x1, x2, le		/* never zero more than "size" bytes */
	sub	x1, x1, x2

	/*
	 * The byte by byte loop zeroes at least enough bytes to align
	 * the pointer and at most "size" bytes.
	 */
align:
	strb	wzr, [x0], #0x01
	subs	x2, x2, #0x01
	b.ne	align

	/* Now the pointer is aligned to 8 bytes */
	cmp	x1, #0x10
	b.lt	lead_out
	/*
	 * Check if zeroing another 8 bytes is needed to reach a 16 byte
	 * aligned address, and do it.
	 */
	tbz	x0, #0x03, aligned_to_16
	str	xzr, [x0], #0x08
	sub	x1, x1, #0x08

	/* When jumping here: x0 is a 16 byte aligned address, x1 is size */
aligned_to_16:
	/* If size is less than 16 bytes, use lead_out to zero the rest */
	cmp	x1, #0x10
	b.lt	lead_out
	lsr	x2, x1, #0x04		/* x2 = number of 16-byte stores */
zero_by_16:
	stp	xzr, xzr, [x0], #0x10
	subs	x2, x2, #0x01
	b.ne	zero_by_16

	/*
	 * Lead out requires the address to be aligned to 8 bytes. It is
	 * used to zero buffers with sizes < 16 and whatever could not be
	 * zeroed by the zero_by_16 loop.
	 */
	ands	x1, x1, #0x0f		/* x1 = remaining tail (< 16 bytes) */
	b.eq	lead_out_end
lead_out:
	tbz	x1, #0x03, lead_out_dword
	str	xzr, [x0], #0x08
lead_out_dword:
	tbz	x1, #0x02, lead_out_word
	str	wzr, [x0], #0x04
lead_out_word:
	tbz	x1, #0x01, lead_out_byte
	strh	wzr, [x0], #0x02
lead_out_byte:
	tbz	x1, #0x00, lead_out_end
	strb	wzr, [x0], #0x01
lead_out_end:
	/*
	 * If x5 is non-zero, "normal" has been used as a lead in to
	 * align the buffer address to the cache line size.
	 */
	cbz	x5, ending

	/*
	 * Here x5 holds the number of lines to zero; x6 is the final
	 * address of the buffer. x0 is a cache line aligned pointer and
	 * x7 is the cache line size in bytes.
	 */
cache_line_zero:
	dc	zva, x0
	add	x0, x0, x7
	subs	x5, x5, #0x01
	b.ne	cache_line_zero

	/* Need to zero remaining bytes? */
	subs	x1, x6, x0
	b.ne	normal

ending:
	ret
END(bzero)