Index: head/sys/arm/arm/bcopyinout_xscale.S
===================================================================
--- head/sys/arm/arm/bcopyinout_xscale.S	(revision 368152)
+++ head/sys/arm/arm/bcopyinout_xscale.S	(revision 368153)
@@ -1,951 +1,819 @@
 /*	$NetBSD: bcopyinout_xscale.S,v 1.3 2003/12/15 09:27:18 scw Exp $	*/
 
 /*-
  * Copyright 2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 	.syntax	unified
 	.text
 	.align	2
 
 #define GET_PCB(tmp) \
 	mrc p15, 0, tmp, c13, c0, 4; \
 	add	tmp, tmp, #(TD_PCB)
 
 /*
  * r0 = user space address
  * r1 = kernel space address
  * r2 = length
  *
  * Copies bytes from user space to kernel space
  */
 ENTRY(copyin)
 	cmp	r2, #0x00
 	movle	r0, #0x00
 	movle	pc, lr			/* Bail early if length is <= 0 */
 
 	adds	r3, r0, r2
 	movcs	r0, #EFAULT
 	RETc(cs)
 
 	ldr	r12, =(VM_MAXUSER_ADDRESS + 1)
 	cmp	r3, r12
 	movcs	r0, #EFAULT
 	RETc(cs)
 
 	ldr	r3, .L_arm_memcpy
 	ldr	r3, [r3]
 	cmp	r3, #0
 	beq	.Lnormal
 	ldr	r3, .L_min_memcpy_size
 	ldr	r3, [r3]
 	cmp	r2, r3
 	blt	.Lnormal
 	stmfd	sp!, {r0-r2, r4, lr}
 	mov     r3, r0
 	mov     r0, r1
 	mov     r1, r3
 	mov     r3, #2 /* SRC_IS_USER */
 	ldr	r4, .L_arm_memcpy
 	mov	lr, pc
 	ldr	pc, [r4]
 	cmp     r0, #0
 	ldmfd   sp!, {r0-r2, r4, lr}
 	moveq	r0, #0
 	RETeq
 
 .Lnormal:
 	stmfd	sp!, {r10-r11, lr}
 
 	GET_PCB(r10)
 	ldr	r10, [r10]
 
 	mov	r3, #0x00
 	adr	ip, .Lcopyin_fault
 	ldr	r11, [r10, #PCB_ONFAULT]
 	str	ip, [r10, #PCB_ONFAULT]
 	bl	.Lcopyin_guts
 	str	r11, [r10, #PCB_ONFAULT]
 	mov	r0, #0x00
 	ldmfd	sp!, {r10-r11, pc}
 
 .Lcopyin_fault:
 	ldr	r0, =EFAULT
 	str	r11, [r10, #PCB_ONFAULT]
 	cmp	r3, #0x00
 	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
 	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
 	ldmfd	sp!, {r10-r11, pc}
 
 .Lcopyin_guts:
 	pld	[r0]
 	/* Word-align the destination buffer */
 	ands	ip, r1, #0x03		/* Already word aligned? */
 	beq	.Lcopyin_wordaligned	/* Yup */
 	rsb	ip, ip, #0x04
 	cmp	r2, ip			/* Enough bytes left to align it? */
 	blt	.Lcopyin_l4_2		/* Nope. Just copy bytewise */
 	sub	r2, r2, ip
 	rsbs	ip, ip, #0x03
 	addne	pc, pc, ip, lsl #3
 	nop
 	ldrbt	ip, [r0], #0x01
 	strb	ip, [r1], #0x01
 	ldrbt	ip, [r0], #0x01
 	strb	ip, [r1], #0x01
 	ldrbt	ip, [r0], #0x01
 	strb	ip, [r1], #0x01
 	cmp	r2, #0x00		/* All done? */
 	RETeq
 
 	/* Destination buffer is now word aligned */
 .Lcopyin_wordaligned:
 	ands	ip, r0, #0x03		/* Is src also word-aligned? */
 	bne	.Lcopyin_bad_align	/* Nope. Things just got bad */
 	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
 	blt	.Lcopyin_w_less_than8
 
 	/* Quad-align the destination buffer */
 	tst	r1, #0x07		/* Already quad aligned? */
 	ldrtne	ip, [r0], #0x04
 	strne	ip, [r1], #0x04
 	subne	r2, r2, #0x04
 	stmfd	sp!, {r4-r9}		/* Free up some registers */
 	mov	r3, #-1			/* Signal restore r4-r9 */
 
 	/* Destination buffer quad aligned, source is word aligned */
 	subs	r2, r2, #0x80
 	blt	.Lcopyin_w_lessthan128
 
 	/* Copy 128 bytes at a time */
 .Lcopyin_w_loop128:
 	ldrt	r4, [r0], #0x04		/* LD:00-03 */
 	ldrt	r5, [r0], #0x04		/* LD:04-07 */
 	pld	[r0, #0x18]		/* Prefetch 0x20 */
 	ldrt	r6, [r0], #0x04		/* LD:08-0b */
 	ldrt	r7, [r0], #0x04		/* LD:0c-0f */
 	ldrt	r8, [r0], #0x04		/* LD:10-13 */
 	ldrt	r9, [r0], #0x04		/* LD:14-17 */
 	strd	r4, [r1], #0x08		/* ST:00-07 */
 	ldrt	r4, [r0], #0x04		/* LD:18-1b */
 	ldrt	r5, [r0], #0x04		/* LD:1c-1f */
 	strd	r6, [r1], #0x08		/* ST:08-0f */
 	ldrt	r6, [r0], #0x04		/* LD:20-23 */
 	ldrt	r7, [r0], #0x04		/* LD:24-27 */
 	pld	[r0, #0x18]		/* Prefetch 0x40 */
 	strd	r8, [r1], #0x08		/* ST:10-17 */
 	ldrt	r8, [r0], #0x04		/* LD:28-2b */
 	ldrt	r9, [r0], #0x04		/* LD:2c-2f */
 	strd	r4, [r1], #0x08		/* ST:18-1f */
 	ldrt	r4, [r0], #0x04		/* LD:30-33 */
 	ldrt	r5, [r0], #0x04		/* LD:34-37 */
 	strd	r6, [r1], #0x08		/* ST:20-27 */
 	ldrt	r6, [r0], #0x04		/* LD:38-3b */
 	ldrt	r7, [r0], #0x04		/* LD:3c-3f */
 	strd	r8, [r1], #0x08		/* ST:28-2f */
 	ldrt	r8, [r0], #0x04		/* LD:40-43 */
 	ldrt	r9, [r0], #0x04		/* LD:44-47 */
 	pld	[r0, #0x18]		/* Prefetch 0x60 */
 	strd	r4, [r1], #0x08		/* ST:30-37 */
 	ldrt	r4, [r0], #0x04		/* LD:48-4b */
 	ldrt	r5, [r0], #0x04		/* LD:4c-4f */
 	strd	r6, [r1], #0x08		/* ST:38-3f */
 	ldrt	r6, [r0], #0x04		/* LD:50-53 */
 	ldrt	r7, [r0], #0x04		/* LD:54-57 */
 	strd	r8, [r1], #0x08		/* ST:40-47 */
 	ldrt	r8, [r0], #0x04		/* LD:58-5b */
 	ldrt	r9, [r0], #0x04		/* LD:5c-5f */
 	strd	r4, [r1], #0x08		/* ST:48-4f */
 	ldrt	r4, [r0], #0x04		/* LD:60-63 */
 	ldrt	r5, [r0], #0x04		/* LD:64-67 */
 	pld	[r0, #0x18]		/* Prefetch 0x80 */
 	strd	r6, [r1], #0x08		/* ST:50-57 */
 	ldrt	r6, [r0], #0x04		/* LD:68-6b */
 	ldrt	r7, [r0], #0x04		/* LD:6c-6f */
 	strd	r8, [r1], #0x08		/* ST:58-5f */
 	ldrt	r8, [r0], #0x04		/* LD:70-73 */
 	ldrt	r9, [r0], #0x04		/* LD:74-77 */
 	strd	r4, [r1], #0x08		/* ST:60-67 */
 	ldrt	r4, [r0], #0x04		/* LD:78-7b */
 	ldrt	r5, [r0], #0x04		/* LD:7c-7f */
 	strd	r6, [r1], #0x08		/* ST:68-6f */
 	strd	r8, [r1], #0x08		/* ST:70-77 */
 	subs	r2, r2, #0x80
 	strd	r4, [r1], #0x08		/* ST:78-7f */
 	bge	.Lcopyin_w_loop128
 
 .Lcopyin_w_lessthan128:
 	adds	r2, r2, #0x80		/* Adjust for extra sub */
 	ldmfdeq	sp!, {r4-r9}
 	RETeq
 	subs	r2, r2, #0x20
 	blt	.Lcopyin_w_lessthan32
 
 	/* Copy 32 bytes at a time */
 .Lcopyin_w_loop32:
 	ldrt	r4, [r0], #0x04
 	ldrt	r5, [r0], #0x04
 	pld	[r0, #0x18]
 	ldrt	r6, [r0], #0x04
 	ldrt	r7, [r0], #0x04
 	ldrt	r8, [r0], #0x04
 	ldrt	r9, [r0], #0x04
 	strd	r4, [r1], #0x08
 	ldrt	r4, [r0], #0x04
 	ldrt	r5, [r0], #0x04
 	strd	r6, [r1], #0x08
 	strd	r8, [r1], #0x08
 	subs	r2, r2, #0x20
 	strd	r4, [r1], #0x08
 	bge	.Lcopyin_w_loop32
 
 .Lcopyin_w_lessthan32:
 	adds	r2, r2, #0x20		/* Adjust for extra sub */
 	ldmfdeq	sp!, {r4-r9}
 	RETeq				/* Return now if done */
 
 	and	r4, r2, #0x18
 	rsb	r5, r4, #0x18
 	subs	r2, r2, r4
 	add	pc, pc, r5, lsl #1
 	nop
 
 	/* At least 24 bytes remaining */
 	ldrt	r4, [r0], #0x04
 	ldrt	r5, [r0], #0x04
 	nop
 	strd	r4, [r1], #0x08
 
 	/* At least 16 bytes remaining */
 	ldrt	r4, [r0], #0x04
 	ldrt	r5, [r0], #0x04
 	nop
 	strd	r4, [r1], #0x08
 
 	/* At least 8 bytes remaining */
 	ldrt	r4, [r0], #0x04
 	ldrt	r5, [r0], #0x04
 	nop
 	strd	r4, [r1], #0x08
 
 	/* Less than 8 bytes remaining */
 	ldmfd	sp!, {r4-r9}
 	RETeq				/* Return now if done */
 	mov	r3, #0x00
 
 .Lcopyin_w_less_than8:
 	subs	r2, r2, #0x04
 	ldrtge	ip, [r0], #0x04
 	strge	ip, [r1], #0x04
 	RETeq				/* Return now if done */
 	addlt	r2, r2, #0x04
 	ldrbt	ip, [r0], #0x01
 	cmp	r2, #0x02
 	ldrbtge	r2, [r0], #0x01
 	strb	ip, [r1], #0x01
 	ldrbtgt	ip, [r0]
 	strbge	r2, [r1], #0x01
 	strbgt	ip, [r1]
 	RET
 
 /*
  * At this point, it has not been possible to word align both buffers.
  * The destination buffer (r1) is word aligned, but the source buffer
  * (r0) is not.
  */
 .Lcopyin_bad_align:
 	stmfd	sp!, {r4-r7}
 	mov	r3, #0x01
 	bic	r0, r0, #0x03
 	cmp	ip, #2
 	ldrt	ip, [r0], #0x04
 	bgt	.Lcopyin_bad3
 	beq	.Lcopyin_bad2
 	b	.Lcopyin_bad1
 
 .Lcopyin_bad1_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #8
-#else
 	mov	r4, ip, lsr #8
-#endif
 	ldrt	r5, [r0], #0x04
 	pld	[r0, #0x018]
 	ldrt	r6, [r0], #0x04
 	ldrt	r7, [r0], #0x04
 	ldrt	ip, [r0], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #24
-	mov	r5, r5, lsl #8
-	orr	r5, r5, r6, lsr #24
-	mov	r6, r6, lsl #8
-	orr	r6, r6, r7, lsr #24
-	mov	r7, r7, lsl #8
-	orr	r7, r7, ip, lsr #24
-#else
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r6, lsl #24
 	mov	r6, r6, lsr #8
 	orr	r6, r6, r7, lsl #24
 	mov	r7, r7, lsr #8
 	orr	r7, r7, ip, lsl #24
-#endif
 	str	r4, [r1], #0x04
 	str	r5, [r1], #0x04
 	str	r6, [r1], #0x04
 	str	r7, [r1], #0x04
 .Lcopyin_bad1:
 	subs	r2, r2, #0x10
 	bge	.Lcopyin_bad1_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq				/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r0, r0, #0x03
 	blt	.Lcopyin_l4
 
 .Lcopyin_bad1_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #8
-#else
 	mov	r4, ip, lsr #8
-#endif
 	ldrt	ip, [r0], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #24
-#else
 	orr	r4, r4, ip, lsl #24
-#endif
 	str	r4, [r1], #0x04
 	bge	.Lcopyin_bad1_loop4
 	sub	r0, r0, #0x03
 	b	.Lcopyin_l4
 
 .Lcopyin_bad2_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #16
-#else
 	mov	r4, ip, lsr #16
-#endif
 	ldrt	r5, [r0], #0x04
 	pld	[r0, #0x018]
 	ldrt	r6, [r0], #0x04
 	ldrt	r7, [r0], #0x04
 	ldrt	ip, [r0], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #16
-	mov	r5, r5, lsl #16
-	orr	r5, r5, r6, lsr #16
-	mov	r6, r6, lsl #16
-	orr	r6, r6, r7, lsr #16
-	mov	r7, r7, lsl #16
-	orr	r7, r7, ip, lsr #16
-#else
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r6, lsl #16
 	mov	r6, r6, lsr #16
 	orr	r6, r6, r7, lsl #16
 	mov	r7, r7, lsr #16
 	orr	r7, r7, ip, lsl #16
-#endif
 	str	r4, [r1], #0x04
 	str	r5, [r1], #0x04
 	str	r6, [r1], #0x04
 	str	r7, [r1], #0x04
 .Lcopyin_bad2:
 	subs	r2, r2, #0x10
 	bge	.Lcopyin_bad2_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq				/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r0, r0, #0x02
 	blt	.Lcopyin_l4
 
 .Lcopyin_bad2_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #16
-#else
 	mov	r4, ip, lsr #16
-#endif
 	ldrt	ip, [r0], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #16
-#else
 	orr	r4, r4, ip, lsl #16
-#endif
 	str	r4, [r1], #0x04
 	bge	.Lcopyin_bad2_loop4
 	sub	r0, r0, #0x02
 	b	.Lcopyin_l4
 
 .Lcopyin_bad3_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #24
-#else
 	mov	r4, ip, lsr #24
-#endif
 	ldrt	r5, [r0], #0x04
 	pld	[r0, #0x018]
 	ldrt	r6, [r0], #0x04
 	ldrt	r7, [r0], #0x04
 	ldrt	ip, [r0], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #8
-	mov	r5, r5, lsl #24
-	orr	r5, r5, r6, lsr #8
-	mov	r6, r6, lsl #24
-	orr	r6, r6, r7, lsr #8
-	mov	r7, r7, lsl #24
-	orr	r7, r7, ip, lsr #8
-#else
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r6, lsl #8
 	mov	r6, r6, lsr #24
 	orr	r6, r6, r7, lsl #8
 	mov	r7, r7, lsr #24
 	orr	r7, r7, ip, lsl #8
-#endif
 	str	r4, [r1], #0x04
 	str	r5, [r1], #0x04
 	str	r6, [r1], #0x04
 	str	r7, [r1], #0x04
 .Lcopyin_bad3:
 	subs	r2, r2, #0x10
 	bge	.Lcopyin_bad3_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq				/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r0, r0, #0x01
 	blt	.Lcopyin_l4
 
 .Lcopyin_bad3_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #24
-#else
 	mov	r4, ip, lsr #24
-#endif
 	ldrt	ip, [r0], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #8
-#else
 	orr	r4, r4, ip, lsl #8
-#endif
 	str	r4, [r1], #0x04
 	bge	.Lcopyin_bad3_loop4
 	sub	r0, r0, #0x01
 
 .Lcopyin_l4:
 	ldmfd	sp!, {r4-r7}
 	mov	r3, #0x00
 	adds	r2, r2, #0x04
 	RETeq
 .Lcopyin_l4_2:
 	rsbs	r2, r2, #0x03
 	addne	pc, pc, r2, lsl #3
 	nop
 	ldrbt	ip, [r0], #0x01
 	strb	ip, [r1], #0x01
 	ldrbt	ip, [r0], #0x01
 	strb	ip, [r1], #0x01
 	ldrbt	ip, [r0]
 	strb	ip, [r1]
 	RET
 END(copyin)
 
 /*
  * r0 = kernel space address
  * r1 = user space address
  * r2 = length
  *
  * Copies bytes from kernel space to user space
  */
 ENTRY(copyout)
 	cmp	r2, #0x00
 	movle	r0, #0x00
 	movle	pc, lr			/* Bail early if length is <= 0 */
 
 	adds	r3, r1, r2
 	movcs	r0, #EFAULT
 	RETc(cs)
 
 	ldr	r12, =(VM_MAXUSER_ADDRESS + 1)
 	cmp	r3, r12
 	movcs	r0, #EFAULT
 	RETc(cs)
 
 	ldr	r3, .L_arm_memcpy
 	ldr	r3, [r3]
 	cmp	r3, #0
 	beq	.Lnormale
 	ldr	r3, .L_min_memcpy_size
 	ldr	r3, [r3]
 	cmp	r2, r3
 	blt	.Lnormale
 	stmfd	sp!, {r0-r2, r4, lr}
 	mov     r3, r0
 	mov     r0, r1
 	mov     r1, r3
 	mov     r3, #1 /* DST_IS_USER */
 	ldr	r4, .L_arm_memcpy
 	mov	lr, pc
 	ldr	pc, [r4]
 	cmp     r0, #0
 	ldmfd   sp!, {r0-r2, r4, lr}
 	moveq	r0, #0
 	RETeq
 
 .Lnormale:
 	stmfd	sp!, {r10-r11, lr}
 
 	GET_PCB(r10)
 	ldr	r10, [r10]
 
 	mov	r3, #0x00
 	adr	ip, .Lcopyout_fault
 	ldr	r11, [r10, #PCB_ONFAULT]
 	str	ip, [r10, #PCB_ONFAULT]
 	bl	.Lcopyout_guts
 	str	r11, [r10, #PCB_ONFAULT]
 	mov	r0, #0x00
 	ldmfd	sp!, {r10-r11, pc}
 
 .Lcopyout_fault:
 	ldr	r0, =EFAULT
 	str	r11, [r10, #PCB_ONFAULT]
 	cmp	r3, #0x00
 	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
 	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
 	ldmfd	sp!, {r10-r11, pc}
 
 .Lcopyout_guts:
 	pld	[r0]
 	/* Word-align the destination buffer */
 	ands	ip, r1, #0x03		/* Already word aligned? */
 	beq	.Lcopyout_wordaligned	/* Yup */
 	rsb	ip, ip, #0x04
 	cmp	r2, ip			/* Enough bytes left to align it? */
 	blt	.Lcopyout_l4_2		/* Nope. Just copy bytewise */
 	sub	r2, r2, ip
 	rsbs	ip, ip, #0x03
 	addne	pc, pc, ip, lsl #3
 	nop
 	ldrb	ip, [r0], #0x01
 	strbt	ip, [r1], #0x01
 	ldrb	ip, [r0], #0x01
 	strbt	ip, [r1], #0x01
 	ldrb	ip, [r0], #0x01
 	strbt	ip, [r1], #0x01
 	cmp	r2, #0x00		/* All done? */
 	RETeq
 
 	/* Destination buffer is now word aligned */
 .Lcopyout_wordaligned:
 	ands	ip, r0, #0x03		/* Is src also word-aligned? */
 	bne	.Lcopyout_bad_align	/* Nope. Things just got bad */
 	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
 	blt	.Lcopyout_w_less_than8
 
 	/* Quad-align the source buffer (required by the ldrd loads below) */
 	tst	r0, #0x07		/* Already quad aligned? */
 	ldrne	ip, [r0], #0x04
 	subne	r2, r2, #0x04
 	strtne	ip, [r1], #0x04
 
 	stmfd	sp!, {r4-r9}		/* Free up some registers */
 	mov	r3, #-1			/* Signal restore r4-r9 */
 
 	/* Destination buffer word aligned, source is quad aligned */
 	subs	r2, r2, #0x80
 	blt	.Lcopyout_w_lessthan128
 
 	/* Copy 128 bytes at a time */
 .Lcopyout_w_loop128:
 	ldrd	r4, [r0], #0x08		/* LD:00-07 */
 	pld	[r0, #0x18]		/* Prefetch 0x20 */
 	ldrd	r6, [r0], #0x08		/* LD:08-0f */
 	ldrd	r8, [r0], #0x08		/* LD:10-17 */
 	strt	r4, [r1], #0x04		/* ST:00-03 */
 	strt	r5, [r1], #0x04		/* ST:04-07 */
 	ldrd	r4, [r0], #0x08		/* LD:18-1f */
 	strt	r6, [r1], #0x04		/* ST:08-0b */
 	strt	r7, [r1], #0x04		/* ST:0c-0f */
 	ldrd	r6, [r0], #0x08		/* LD:20-27 */
 	pld	[r0, #0x18]		/* Prefetch 0x40 */
 	strt	r8, [r1], #0x04		/* ST:10-13 */
 	strt	r9, [r1], #0x04		/* ST:14-17 */
 	ldrd	r8, [r0], #0x08		/* LD:28-2f */
 	strt	r4, [r1], #0x04		/* ST:18-1b */
 	strt	r5, [r1], #0x04		/* ST:1c-1f */
 	ldrd	r4, [r0], #0x08		/* LD:30-37 */
 	strt	r6, [r1], #0x04		/* ST:20-23 */
 	strt	r7, [r1], #0x04		/* ST:24-27 */
 	ldrd	r6, [r0], #0x08		/* LD:38-3f */
 	strt	r8, [r1], #0x04		/* ST:28-2b */
 	strt	r9, [r1], #0x04		/* ST:2c-2f */
 	ldrd	r8, [r0], #0x08		/* LD:40-47 */
 	pld	[r0, #0x18]		/* Prefetch 0x60 */
 	strt	r4, [r1], #0x04		/* ST:30-33 */
 	strt	r5, [r1], #0x04		/* ST:34-37 */
 	ldrd	r4, [r0], #0x08		/* LD:48-4f */
 	strt	r6, [r1], #0x04		/* ST:38-3b */
 	strt	r7, [r1], #0x04		/* ST:3c-3f */
 	ldrd	r6, [r0], #0x08		/* LD:50-57 */
 	strt	r8, [r1], #0x04		/* ST:40-43 */
 	strt	r9, [r1], #0x04		/* ST:44-47 */
 	ldrd	r8, [r0], #0x08		/* LD:58-5f */
 	strt	r4, [r1], #0x04		/* ST:48-4b */
 	strt	r5, [r1], #0x04		/* ST:4c-4f */
 	ldrd	r4, [r0], #0x08		/* LD:60-67 */
 	pld	[r0, #0x18]		/* Prefetch 0x80 */
 	strt	r6, [r1], #0x04		/* ST:50-53 */
 	strt	r7, [r1], #0x04		/* ST:54-57 */
 	ldrd	r6, [r0], #0x08		/* LD:68-6f */
 	strt	r8, [r1], #0x04		/* ST:58-5b */
 	strt	r9, [r1], #0x04		/* ST:5c-5f */
 	ldrd	r8, [r0], #0x08		/* LD:70-77 */
 	strt	r4, [r1], #0x04		/* ST:60-63 */
 	strt	r5, [r1], #0x04		/* ST:64-67 */
 	ldrd	r4, [r0], #0x08		/* LD:78-7f */
 	strt	r6, [r1], #0x04		/* ST:68-6b */
 	strt	r7, [r1], #0x04		/* ST:6c-6f */
 	strt	r8, [r1], #0x04		/* ST:70-73 */
 	strt	r9, [r1], #0x04		/* ST:74-77 */
 	subs	r2, r2, #0x80
 	strt	r4, [r1], #0x04		/* ST:78-7b */
 	strt	r5, [r1], #0x04		/* ST:7c-7f */
 	bge	.Lcopyout_w_loop128
 
 .Lcopyout_w_lessthan128:
 	adds	r2, r2, #0x80		/* Adjust for extra sub */
 	ldmfdeq	sp!, {r4-r9}
 	RETeq				/* Return now if done */
 	subs	r2, r2, #0x20
 	blt	.Lcopyout_w_lessthan32
 
 	/* Copy 32 bytes at a time */
 .Lcopyout_w_loop32:
 	ldrd	r4, [r0], #0x08
 	pld	[r0, #0x18]
 	ldrd	r6, [r0], #0x08
 	ldrd	r8, [r0], #0x08
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	ldrd	r4, [r0], #0x08
 	strt	r6, [r1], #0x04
 	strt	r7, [r1], #0x04
 	strt	r8, [r1], #0x04
 	strt	r9, [r1], #0x04
 	subs	r2, r2, #0x20
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	bge	.Lcopyout_w_loop32
 
 .Lcopyout_w_lessthan32:
 	adds	r2, r2, #0x20		/* Adjust for extra sub */
 	ldmfdeq	sp!, {r4-r9}
 	RETeq				/* Return now if done */
 
 	and	r4, r2, #0x18
 	rsb	r5, r4, #0x18
 	subs	r2, r2, r4
 	add	pc, pc, r5, lsl #1
 	nop
 
 	/* At least 24 bytes remaining */
 	ldrd	r4, [r0], #0x08
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	nop
 
 	/* At least 16 bytes remaining */
 	ldrd	r4, [r0], #0x08
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	nop
 
 	/* At least 8 bytes remaining */
 	ldrd	r4, [r0], #0x08
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	nop
 
 	/* Less than 8 bytes remaining */
 	ldmfd	sp!, {r4-r9}
 	RETeq				/* Return now if done */
 	mov	r3, #0x00
 
 .Lcopyout_w_less_than8:
 	subs	r2, r2, #0x04
 	ldrge	ip, [r0], #0x04
 	strtge	ip, [r1], #0x04
 	RETeq				/* Return now if done */
 	addlt	r2, r2, #0x04
 	ldrb	ip, [r0], #0x01
 	cmp	r2, #0x02
 	ldrbge	r2, [r0], #0x01
 	strbt	ip, [r1], #0x01
 	ldrbgt	ip, [r0]
 	strbtge	r2, [r1], #0x01
 	strbtgt	ip, [r1]
 	RET
 
 /*
  * At this point, it has not been possible to word align both buffers.
  * The destination buffer (r1) is word aligned, but the source buffer
  * (r0) is not.
  */
 .Lcopyout_bad_align:
 	stmfd	sp!, {r4-r7}
 	mov	r3, #0x01
 	bic	r0, r0, #0x03
 	cmp	ip, #2
 	ldr	ip, [r0], #0x04
 	bgt	.Lcopyout_bad3
 	beq	.Lcopyout_bad2
 	b	.Lcopyout_bad1
 
 .Lcopyout_bad1_loop16:
-#ifdef	__ARMEB__
-	mov	r4, ip, lsl #8
-#else
 	mov	r4, ip, lsr #8
-#endif
 	ldr	r5, [r0], #0x04
 	pld	[r0, #0x018]
 	ldr	r6, [r0], #0x04
 	ldr	r7, [r0], #0x04
 	ldr	ip, [r0], #0x04
-#ifdef	__ARMEB__
-	orr	r4, r4, r5, lsr #24
-	mov	r5, r5, lsl #8
-	orr	r5, r5, r6, lsr #24
-	mov	r6, r6, lsl #8
-	orr	r6, r6, r7, lsr #24
-	mov	r7, r7, lsl #8
-	orr	r7, r7, ip, lsr #24
-#else
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r6, lsl #24
 	mov	r6, r6, lsr #8
 	orr	r6, r6, r7, lsl #24
 	mov	r7, r7, lsr #8
 	orr	r7, r7, ip, lsl #24
-#endif
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	strt	r6, [r1], #0x04
 	strt	r7, [r1], #0x04
 .Lcopyout_bad1:
 	subs	r2, r2, #0x10
 	bge	.Lcopyout_bad1_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq				/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r0, r0, #0x03
 	blt	.Lcopyout_l4
 
 .Lcopyout_bad1_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #8
-#else
 	mov	r4, ip, lsr #8
-#endif
 	ldr	ip, [r0], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #24
-#else
 	orr	r4, r4, ip, lsl #24
-#endif
 	strt	r4, [r1], #0x04
 	bge	.Lcopyout_bad1_loop4
 	sub	r0, r0, #0x03
 	b	.Lcopyout_l4
 
 .Lcopyout_bad2_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #16
-#else
 	mov	r4, ip, lsr #16
-#endif
 	ldr	r5, [r0], #0x04
 	pld	[r0, #0x018]
 	ldr	r6, [r0], #0x04
 	ldr	r7, [r0], #0x04
 	ldr	ip, [r0], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #16
-	mov	r5, r5, lsl #16
-	orr	r5, r5, r6, lsr #16
-	mov	r6, r6, lsl #16
-	orr	r6, r6, r7, lsr #16
-	mov	r7, r7, lsl #16
-	orr	r7, r7, ip, lsr #16
-#else
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r6, lsl #16
 	mov	r6, r6, lsr #16
 	orr	r6, r6, r7, lsl #16
 	mov	r7, r7, lsr #16
 	orr	r7, r7, ip, lsl #16
-#endif
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	strt	r6, [r1], #0x04
 	strt	r7, [r1], #0x04
 .Lcopyout_bad2:
 	subs	r2, r2, #0x10
 	bge	.Lcopyout_bad2_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq				/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r0, r0, #0x02
 	blt	.Lcopyout_l4
 
 .Lcopyout_bad2_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #16
-#else
 	mov	r4, ip, lsr #16
-#endif
 	ldr	ip, [r0], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #16
-#else
 	orr	r4, r4, ip, lsl #16
-#endif
 	strt	r4, [r1], #0x04
 	bge	.Lcopyout_bad2_loop4
 	sub	r0, r0, #0x02
 	b	.Lcopyout_l4
 
 .Lcopyout_bad3_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #24
-#else
 	mov	r4, ip, lsr #24
-#endif
 	ldr	r5, [r0], #0x04
 	pld	[r0, #0x018]
 	ldr	r6, [r0], #0x04
 	ldr	r7, [r0], #0x04
 	ldr	ip, [r0], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #8
-	mov	r5, r5, lsl #24
-	orr	r5, r5, r6, lsr #8
-	mov	r6, r6, lsl #24
-	orr	r6, r6, r7, lsr #8
-	mov	r7, r7, lsl #24
-	orr	r7, r7, ip, lsr #8
-#else
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r6, lsl #8
 	mov	r6, r6, lsr #24
 	orr	r6, r6, r7, lsl #8
 	mov	r7, r7, lsr #24
 	orr	r7, r7, ip, lsl #8
-#endif
 	strt	r4, [r1], #0x04
 	strt	r5, [r1], #0x04
 	strt	r6, [r1], #0x04
 	strt	r7, [r1], #0x04
 .Lcopyout_bad3:
 	subs	r2, r2, #0x10
 	bge	.Lcopyout_bad3_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq				/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r0, r0, #0x01
 	blt	.Lcopyout_l4
 
 .Lcopyout_bad3_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #24
-#else
 	mov	r4, ip, lsr #24
-#endif
 	ldr	ip, [r0], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #8
-#else
 	orr	r4, r4, ip, lsl #8
-#endif
 	strt	r4, [r1], #0x04
 	bge	.Lcopyout_bad3_loop4
 	sub	r0, r0, #0x01
 
 .Lcopyout_l4:
 	ldmfd	sp!, {r4-r7}
 	mov	r3, #0x00
 	adds	r2, r2, #0x04
 	RETeq
 .Lcopyout_l4_2:
 	rsbs	r2, r2, #0x03
 	addne	pc, pc, r2, lsl #3
 	nop
 	ldrb	ip, [r0], #0x01
 	strbt	ip, [r1], #0x01
 	ldrb	ip, [r0], #0x01
 	strbt	ip, [r1], #0x01
 	ldrb	ip, [r0]
 	strbt	ip, [r1]
 	RET
 END(copyout)
 
Index: head/sys/arm/arm/cpufunc.c
===================================================================
--- head/sys/arm/arm/cpufunc.c	(revision 368152)
+++ head/sys/arm/arm/cpufunc.c	(revision 368153)
@@ -1,589 +1,586 @@
 /*	$NetBSD: cpufunc.c,v 1.65 2003/11/05 12:53:15 scw Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * arm9 support code Copyright (C) 2001 ARM Ltd
  * Copyright (c) 1997 Mark Brinicombe.
  * Copyright (c) 1997 Causality Limited
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Causality Limited.
  * 4. The name of Causality Limited may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY CAUSALITY LIMITED ``AS IS'' AND ANY EXPRESS
  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL CAUSALITY LIMITED BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * cpufuncs.c
  *
  * C functions for supporting CPU / MMU / TLB specific operations.
  *
  * Created      : 30/01/97
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bus.h>
 #include <machine/bus.h>
 #include <machine/cpu.h>
 #include <machine/disassem.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/uma.h>
 
 #include <machine/cpufunc.h>
 
 /* PRIMARY CACHE VARIABLES */
 int	arm_picache_size;
 int	arm_picache_line_size;
 int	arm_picache_ways;
 
 int	arm_pdcache_size;	/* and unified */
 int	arm_pdcache_line_size;
 int	arm_pdcache_ways;
 
 int	arm_pcache_type;
 int	arm_pcache_unified;
 
 int	arm_dcache_align;
 int	arm_dcache_align_mask;
 
 u_int	arm_cache_level;
 u_int	arm_cache_type[14];
 u_int	arm_cache_loc;
 
 #if defined(CPU_ARM9E)
 static void arm10_setup(void);
 #endif
 #ifdef CPU_MV_PJ4B
 static void pj4bv7_setup(void);
 #endif
 #if defined(CPU_ARM1176)
 static void arm11x6_setup(void);
 #endif
 #if defined(CPU_CORTEXA) || defined(CPU_KRAIT)
 static void cortexa_setup(void);
 #endif
 
 #if defined(CPU_ARM9E)
 struct cpu_functions armv5_ec_cpufuncs = {
 	/* CPU functions */
 
 	cpufunc_nullop,			/* cpwait		*/
 
 	/* MMU functions */
 
 	cpufunc_control,		/* control		*/
 	armv5_ec_setttb,		/* Setttb		*/
 
 	/* TLB functions */
 
 	armv4_tlb_flushID,		/* tlb_flushID		*/
 	arm9_tlb_flushID_SE,		/* tlb_flushID_SE	*/
 	armv4_tlb_flushD,		/* tlb_flushD		*/
 	armv4_tlb_flushD_SE,		/* tlb_flushD_SE	*/
 
 	/* Cache operations */
 
 	armv5_ec_icache_sync_range,	/* icache_sync_range	*/
 
 	armv5_ec_dcache_wbinv_all,	/* dcache_wbinv_all	*/
 	armv5_ec_dcache_wbinv_range,	/* dcache_wbinv_range	*/
 	armv5_ec_dcache_inv_range,	/* dcache_inv_range	*/
 	armv5_ec_dcache_wb_range,	/* dcache_wb_range	*/
 
 	armv4_idcache_inv_all,		/* idcache_inv_all	*/
 	armv5_ec_idcache_wbinv_all,	/* idcache_wbinv_all	*/
 	armv5_ec_idcache_wbinv_range,	/* idcache_wbinv_range	*/
 
 	cpufunc_nullop,                 /* l2cache_wbinv_all    */
 	(void *)cpufunc_nullop,         /* l2cache_wbinv_range  */
       	(void *)cpufunc_nullop,         /* l2cache_inv_range    */
 	(void *)cpufunc_nullop,         /* l2cache_wb_range     */
 	(void *)cpufunc_nullop,         /* l2cache_drain_writebuf */
 
 	/* Other functions */
 
 	armv4_drain_writebuf,		/* drain_writebuf	*/
 
 	(void *)cpufunc_nullop,		/* sleep		*/
 
 	/* Soft functions */
 
 	arm9_context_switch,		/* context_switch	*/
 
 	arm10_setup			/* cpu setup		*/
 
 };
 
 struct cpu_functions sheeva_cpufuncs = {
 	/* CPU functions */
 
 	cpufunc_nullop,			/* cpwait		*/
 
 	/* MMU functions */
 
 	cpufunc_control,		/* control		*/
 	sheeva_setttb,			/* Setttb		*/
 
 	/* TLB functions */
 
 	armv4_tlb_flushID,		/* tlb_flushID		*/
 	arm9_tlb_flushID_SE,		/* tlb_flushID_SE	*/
 	armv4_tlb_flushD,		/* tlb_flushD		*/
 	armv4_tlb_flushD_SE,		/* tlb_flushD_SE	*/
 
 	/* Cache operations */
 
 	armv5_ec_icache_sync_range,	/* icache_sync_range	*/
 
 	armv5_ec_dcache_wbinv_all,	/* dcache_wbinv_all	*/
 	sheeva_dcache_wbinv_range,	/* dcache_wbinv_range	*/
 	sheeva_dcache_inv_range,	/* dcache_inv_range	*/
 	sheeva_dcache_wb_range,		/* dcache_wb_range	*/
 
 	armv4_idcache_inv_all,		/* idcache_inv_all	*/
 	armv5_ec_idcache_wbinv_all,	/* idcache_wbinv_all	*/
 	sheeva_idcache_wbinv_range,	/* idcache_wbinv_range	*/
 
 	sheeva_l2cache_wbinv_all,	/* l2cache_wbinv_all    */
 	sheeva_l2cache_wbinv_range,	/* l2cache_wbinv_range  */
 	sheeva_l2cache_inv_range,	/* l2cache_inv_range    */
 	sheeva_l2cache_wb_range,	/* l2cache_wb_range     */
 	(void *)cpufunc_nullop,         /* l2cache_drain_writebuf */
 
 	/* Other functions */
 
 	armv4_drain_writebuf,		/* drain_writebuf	*/
 
 	sheeva_cpu_sleep,		/* sleep		*/
 
 	/* Soft functions */
 
 	arm9_context_switch,		/* context_switch	*/
 
 	arm10_setup			/* cpu setup		*/
 };
 #endif /* CPU_ARM9E */
 
 #ifdef CPU_MV_PJ4B
 struct cpu_functions pj4bv7_cpufuncs = {
 	/* Cache operations */
 	.cf_l2cache_wbinv_all = (void *)cpufunc_nullop,
 	.cf_l2cache_wbinv_range = (void *)cpufunc_nullop,
 	.cf_l2cache_inv_range = (void *)cpufunc_nullop,
 	.cf_l2cache_wb_range = (void *)cpufunc_nullop,
 	.cf_l2cache_drain_writebuf = (void *)cpufunc_nullop,
 
 	/* Other functions */
 	.cf_sleep = (void *)cpufunc_nullop,
 
 	/* Soft functions */
 	.cf_setup = pj4bv7_setup
 };
 #endif /* CPU_MV_PJ4B */
 
 #if defined(CPU_ARM1176)
 struct cpu_functions arm1176_cpufuncs = {
 	/* Cache operations */
 	.cf_l2cache_wbinv_all = (void *)cpufunc_nullop,
 	.cf_l2cache_wbinv_range = (void *)cpufunc_nullop,
 	.cf_l2cache_inv_range = (void *)cpufunc_nullop,
 	.cf_l2cache_wb_range = (void *)cpufunc_nullop,
 	.cf_l2cache_drain_writebuf = (void *)cpufunc_nullop,
 
 	/* Other functions */
 	.cf_sleep = arm11x6_sleep, 
 
 	/* Soft functions */
 	.cf_setup = arm11x6_setup
 };
 #endif /*CPU_ARM1176 */
 
 #if defined(CPU_CORTEXA) || defined(CPU_KRAIT)
 struct cpu_functions cortexa_cpufuncs = {
 	/* Cache operations */
 
 	/*
 	 * Note: For CPUs using the PL310 the L2 ops are filled in when the
 	 * L2 cache controller is actually enabled.
 	 */
 	.cf_l2cache_wbinv_all = cpufunc_nullop,
 	.cf_l2cache_wbinv_range = (void *)cpufunc_nullop,
 	.cf_l2cache_inv_range = (void *)cpufunc_nullop,
 	.cf_l2cache_wb_range = (void *)cpufunc_nullop,
 	.cf_l2cache_drain_writebuf = (void *)cpufunc_nullop,
 
 	/* Other functions */
 	.cf_sleep = armv7_cpu_sleep,
 
 	/* Soft functions */
 	.cf_setup = cortexa_setup
 };
 #endif /* CPU_CORTEXA || CPU_KRAIT */
 
 /*
  * Global constants also used by locore.s
  */
 
 struct cpu_functions cpufuncs;
 u_int cputype;
 
 #if defined (CPU_ARM9E) ||	\
   defined(CPU_ARM1176) ||	\
   defined(CPU_MV_PJ4B) ||			\
   defined(CPU_CORTEXA) || defined(CPU_KRAIT)
 
 static void get_cachetype_cp15(void);
 
 /* Additional cache information local to this file.  Log2 of some of the
    above numbers.  */
 static int	arm_dcache_l2_nsets;
 static int	arm_dcache_l2_assoc;
 static int	arm_dcache_l2_linesize;
 
 /*
  * Probe the cache geometry through CP15 and record it in the arm_*cache*
  * globals.
  *
  * The cache type register is read first.  If it reads back identical to
  * the main ID register it is treated as unimplemented and only
  * arm_dcache_align_mask is derived from the current arm_dcache_align.
  * Otherwise the value is decoded either in the ARMv7 format (walking the
  * per-level cache registers) or in the older single-register format.
  */
 static void
 get_cachetype_cp15(void)
 {
 	u_int ctype, isize, dsize, cpuid;
 	u_int clevel, csize, i, sel;
 	u_int multiplier;
 	u_char type;
 
 	ctype = cp15_ctr_get();
 	cpuid = cp15_midr_get();
 	/*
 	 * ...and thus spake the ARM ARM:
 	 *
 	 * If an <opcode2> value corresponding to an unimplemented or
 	 * reserved ID register is encountered, the System Control
 	 * processor returns the value of the main ID register.
 	 */
 	if (ctype == cpuid)
 		goto out;
 
 	if (CPU_CT_FORMAT(ctype) == CPU_CT_ARMV7) {
 		/* Read the cache level ID register. */
 		__asm __volatile("mrc p15, 1, %0, c0, c0, 1"
 		    : "=r" (clevel));
 		arm_cache_level = clevel;
 		arm_cache_loc = CPU_CLIDR_LOC(arm_cache_level);
 		i = 0;
 		/*
 		 * One 3-bit type field per level: stop at the first level
 		 * whose type is zero, or after seven levels.
 		 */
 		while ((type = (clevel & 0x7)) && i < 7) {
 			if (type == CACHE_DCACHE || type == CACHE_UNI_CACHE ||
 			    type == CACHE_SEP_CACHE) {
 				/*
 				 * Select level i with the low selector bit
 				 * clear, then read back the size ID register
 				 * for that selection.
 				 */
 				sel = i << 1;
 				__asm __volatile("mcr p15, 2, %0, c0, c0, 0"
 				    : : "r" (sel));
 				__asm __volatile("mrc p15, 1, %0, c0, c0, 0"
 				    : "=r" (csize));
 				arm_cache_type[sel] = csize;
 				/*
 				 * Overwritten on every matching level, so the
 				 * last level scanned determines the alignment.
 				 */
 				arm_dcache_align = 1 <<
 				    (CPUV7_CT_xSIZE_LEN(csize) + 4);
 				arm_dcache_align_mask = arm_dcache_align - 1;
 			}
 			if (type == CACHE_ICACHE || type == CACHE_SEP_CACHE) {
 				/* Same again with the low selector bit set. */
 				sel = (i << 1) | 1;
 				__asm __volatile("mcr p15, 2, %0, c0, c0, 0"
 				    : : "r" (sel));
 				__asm __volatile("mrc p15, 1, %0, c0, c0, 0"
 				    : "=r" (csize));
 				arm_cache_type[sel] = csize;
 			}
 			i++;
 			clevel >>= 3;
 		}
 	} else {
 		if ((ctype & CPU_CT_S) == 0)
 			arm_pcache_unified = 1;
 
 		/*
 		 * If you want to know how this code works, go read the ARM ARM.
 		 */
 
 		arm_pcache_type = CPU_CT_CTYPE(ctype);
 
 		/* The I-cache field is only decoded when the S bit was set. */
 		if (arm_pcache_unified == 0) {
 			isize = CPU_CT_ISIZE(ctype);
 			/* Multiplier is 3 when the M bit is set, 2 otherwise. */
 			multiplier = (isize & CPU_CT_xSIZE_M) ? 3 : 2;
 			arm_picache_line_size = 1U << (CPU_CT_xSIZE_LEN(isize) + 3);
 			if (CPU_CT_xSIZE_ASSOC(isize) == 0) {
 				if (isize & CPU_CT_xSIZE_M)
 					arm_picache_line_size = 0; /* not present */
 				else
 					arm_picache_ways = 1;
 			} else {
 				arm_picache_ways = multiplier <<
 				    (CPU_CT_xSIZE_ASSOC(isize) - 1);
 			}
 			arm_picache_size = multiplier << (CPU_CT_xSIZE_SIZE(isize) + 8);
 		}
 
 		/* The D-cache (or unified cache) field is always decoded. */
 		dsize = CPU_CT_DSIZE(ctype);
 		multiplier = (dsize & CPU_CT_xSIZE_M) ? 3 : 2;
 		arm_pdcache_line_size = 1U << (CPU_CT_xSIZE_LEN(dsize) + 3);
 		if (CPU_CT_xSIZE_ASSOC(dsize) == 0) {
 			if (dsize & CPU_CT_xSIZE_M)
 				arm_pdcache_line_size = 0; /* not present */
 			else
 				arm_pdcache_ways = 1;
 		} else {
 			arm_pdcache_ways = multiplier <<
 			    (CPU_CT_xSIZE_ASSOC(dsize) - 1);
 		}
 		arm_pdcache_size = multiplier << (CPU_CT_xSIZE_SIZE(dsize) + 8);
 
 		arm_dcache_align = arm_pdcache_line_size;
 
 		/* File-local copies of the D-cache geometry. */
 		arm_dcache_l2_assoc = CPU_CT_xSIZE_ASSOC(dsize) + multiplier - 2;
 		arm_dcache_l2_linesize = CPU_CT_xSIZE_LEN(dsize) + 3;
 		arm_dcache_l2_nsets = 6 + CPU_CT_xSIZE_SIZE(dsize) -
 		    CPU_CT_xSIZE_ASSOC(dsize) - CPU_CT_xSIZE_LEN(dsize);
 
 		/*
 		 * Also the target of the early "goto out" above, which jumps
 		 * into this else block from outside it.
 		 */
 	out:
 		arm_dcache_align_mask = arm_dcache_align - 1;
 	}
 }
 #endif /* CPU_ARM9E || CPU_ARM1176 || CPU_MV_PJ4B || CPU_CORTEXA || CPU_KRAIT */
 
 /*
  * Identify the running CPU from the main ID register, copy the matching
  * cpu_functions table into cpufuncs and probe the cache layout.
  *
  * Returns 0 once a supported CPU has been recognised and the UMA alignment
  * has been set from arm_dcache_align_mask.
  *
  * Cannot panic here as we may not have a console yet ...
  * NOTE(review): the fall-through path below calls panic() regardless, so
  * the return of ARCHITECTURE_NOT_PRESENT is only reached if panic() returns.
  */
 
 int
 set_cpufuncs(void)
 {
 	cputype = cp15_midr_get();
 	cputype &= CPU_ID_CPU_MASK;
 
 #if defined(CPU_ARM9E)
 	if (cputype == CPU_ID_MV88FR131 || cputype == CPU_ID_MV88FR571_VD ||
 	    cputype == CPU_ID_MV88FR571_41) {
 		uint32_t sheeva_ctrl;
 
 		sheeva_ctrl = (MV_DC_STREAM_ENABLE | MV_BTB_DISABLE |
 		    MV_L2_ENABLE);
 		/*
 		 * Workaround for Marvell MV78100 CPU: Cache prefetch
 		 * mechanism may affect the cache coherency validity,
 		 * so it needs to be disabled.
 		 *
 		 * Refer to errata document MV-S501058-00C.pdf (p. 3.1
 		 * L2 Prefetching Mechanism) for details.
 		 */
 		if (cputype == CPU_ID_MV88FR571_VD ||
 		    cputype == CPU_ID_MV88FR571_41)
 			sheeva_ctrl |= MV_L2_PREFETCH_DISABLE;
 
 		sheeva_control_ext(0xffffffff & ~MV_WA_ENABLE, sheeva_ctrl);
 
 		cpufuncs = sheeva_cpufuncs;
 		get_cachetype_cp15();
 		pmap_pte_init_generic();
 		goto out;
 	} else if (cputype == CPU_ID_ARM926EJS) {
 		cpufuncs = armv5_ec_cpufuncs;
 		get_cachetype_cp15();
 		pmap_pte_init_generic();
 		goto out;
 	}
 #endif /* CPU_ARM9E */
 #if defined(CPU_ARM1176)
 	if (cputype == CPU_ID_ARM1176JZS) {
 		cpufuncs = arm1176_cpufuncs;
 		get_cachetype_cp15();
 		goto out;
 	}
 #endif /* CPU_ARM1176 */
 #if defined(CPU_CORTEXA) || defined(CPU_KRAIT)
 	/* All of these cores share the single cortexa_cpufuncs table. */
 	switch(cputype & CPU_ID_SCHEME_MASK) {
 	case CPU_ID_CORTEXA5:
 	case CPU_ID_CORTEXA7:
 	case CPU_ID_CORTEXA8:
 	case CPU_ID_CORTEXA9:
 	case CPU_ID_CORTEXA12:
 	case CPU_ID_CORTEXA15:
 	case CPU_ID_CORTEXA53:
 	case CPU_ID_CORTEXA57:
 	case CPU_ID_CORTEXA72:
 	case CPU_ID_KRAIT300:
 		cpufuncs = cortexa_cpufuncs;
 		get_cachetype_cp15();
 		goto out;
 	default:
 		break;
 	}
 #endif /* CPU_CORTEXA || CPU_KRAIT */
 
 #if defined(CPU_MV_PJ4B)
 	if (cputype == CPU_ID_MV88SV581X_V7 ||
 	    cputype == CPU_ID_MV88SV584X_V7 ||
 	    cputype == CPU_ID_ARM_88SV581X_V7) {
 		cpufuncs = pj4bv7_cpufuncs;
 		get_cachetype_cp15();
 		goto out;
 	}
 #endif /* CPU_MV_PJ4B */
 
 	/*
 	 * Bzzzz. And the answer was ...
 	 */
 	panic("No support for this CPU type (%08x) in kernel", cputype);
 	return(ARCHITECTURE_NOT_PRESENT);
 out:
 	/* Every recognised CPU ends up here after probing its caches. */
 	uma_set_align(arm_dcache_align_mask);
 	return (0);
 }
 
 /*
  * CPU Setup code
  */
 
 #if defined(CPU_ARM9E)
 /*
  * Program the CPU control register for CPU_ARM9E builds: MMU, I/D caches,
  * write buffer and branch prediction are enabled, alignment faults are
  * enabled unless ARM32_DISABLE_ALIGNMENT_FAULTS is defined, and the vectors
  * are relocated when vector_page is ARM_VECTORS_HIGH.  The caches are
  * written back and invalidated both before and after the register write.
  */
 static void
 arm10_setup(void)
 {
 	int cpuctrl, cpuctrlmask;
 
 	cpuctrl = CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_SYST_ENABLE
 	    | CPU_CONTROL_IC_ENABLE | CPU_CONTROL_DC_ENABLE
 	    | CPU_CONTROL_WBUF_ENABLE | CPU_CONTROL_BPRD_ENABLE;
 	/*
 	 * NOTE(review): cpuctrlmask is assembled here but never read;
 	 * cpu_control() below is passed 0xffffffff as its mask instead.
 	 */
 	cpuctrlmask = CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_SYST_ENABLE
 	    | CPU_CONTROL_IC_ENABLE | CPU_CONTROL_DC_ENABLE
 	    | CPU_CONTROL_WBUF_ENABLE | CPU_CONTROL_ROM_ENABLE
 	    | CPU_CONTROL_BEND_ENABLE | CPU_CONTROL_AFLT_ENABLE
 	    | CPU_CONTROL_BPRD_ENABLE
 	    | CPU_CONTROL_ROUNDROBIN | CPU_CONTROL_CPCLK;
 
 #ifndef ARM32_DISABLE_ALIGNMENT_FAULTS
 	cpuctrl |= CPU_CONTROL_AFLT_ENABLE;
 #endif
 
-#ifdef __ARMEB__
-	cpuctrl |= CPU_CONTROL_BEND_ENABLE;
-#endif
 
 	/* Clear out the cache */
 	cpu_idcache_wbinv_all();
 
 	/* Now really make sure they are clean.  */
 	/*
 	 * NOTE(review): r0 appears in the instruction text but is not bound
 	 * as an asm operand, so the value written is whatever r0 holds here.
 	 */
 	__asm __volatile ("mcr\tp15, 0, r0, c7, c7, 0" : : );
 
 	if (vector_page == ARM_VECTORS_HIGH)
 		cpuctrl |= CPU_CONTROL_VECRELOC;
 
 	/* Set the control register */
 	cpu_control(0xffffffff, cpuctrl);
 
 	/* And again. */
 	cpu_idcache_wbinv_all();
 }
 #endif	/* CPU_ARM9E */
 
 #if defined(CPU_ARM1176) \
  || defined(CPU_MV_PJ4B) \
  || defined(CPU_CORTEXA) || defined(CPU_KRAIT)
 /*
  * Enable the performance-monitor cycle counter.  Shared by the ARM1176,
  * PJ4B and Cortex-A/Krait setup routines in this file.
  */
 static __inline void
 cpu_scc_setup_ccnt(void)
 {
 /* This is how you give userland access to the CCNT and PMCn
  * registers.
  * BEWARE! This gives write access also, which may not be what
  * you want!
  */
 #ifdef _PMC_USER_READ_WRITE_
 	/* Set PMUSERENR[0] to allow userland access */
 	cp15_pmuserenr_set(1);
 #endif
 #if defined(CPU_ARM1176)
 	/* Set PMCR[2,0] to enable counters and reset CCNT */
 	/* 5 == bit 2 | bit 0. */
 	cp15_pmcr_set(5);
 #else
 	/* Set up the PMCCNTR register as a cyclecounter:
 	 * Set PMINTENCLR to 0xFFFFFFFF to block interrupts
 	 * Set PMCR[2,0] to enable counters and reset CCNT
 	 * Set PMCNTENSET to 0x80000000 to enable CCNT */
 	cp15_pminten_clr(0xFFFFFFFF);
 	cp15_pmcr_set(5);
 	cp15_pmcnten_set(0x80000000);
 #endif
 }
 #endif
 
 #if defined(CPU_ARM1176)
 /*
  * ARM1176 setup hook: apply the auxiliary-control errata bit when the CPU
  * ID matches, then enable the cycle counter.
  */
 static void
 arm11x6_setup(void)
 {
 	uint32_t auxctrl, auxctrl_wax;
 	uint32_t tmp, tmp2;
 	uint32_t cpuid;
 
 	cpuid = cp15_midr_get();
 
 	/*
 	 * auxctrl holds the bits to OR in, auxctrl_wax the mask ANDed in
 	 * first; these defaults leave the register value unchanged.
 	 */
 	auxctrl = 0;
 	auxctrl_wax = ~0;
 
 	/*
 	 * Enable an errata workaround
 	 */
 	if ((cpuid & CPU_ID_CPU_MASK) == CPU_ID_ARM1176JZS) { /* ARM1176JZSr0 */
 		auxctrl = ARM1176_AUXCTL_PHD;
 		auxctrl_wax = ~ARM1176_AUXCTL_PHD;
 	}
 
 	/* Read-modify-write; skip the write when nothing changes. */
 	tmp = cp15_actlr_get();
 	tmp2 = tmp;
 	tmp &= auxctrl_wax;
 	tmp |= auxctrl;
 	if (tmp != tmp2)
 		cp15_actlr_set(tmp);
 
 	cpu_scc_setup_ccnt();
 }
 #endif  /* CPU_ARM1176 */
 
 #ifdef CPU_MV_PJ4B
 /*
  * Setup hook for CPU_MV_PJ4B builds: run pj4b_config(), then enable the
  * cycle counter.
  */
 static void
 pj4bv7_setup(void)
 {
 
 	pj4b_config();
 	cpu_scc_setup_ccnt();
 }
 #endif /* CPU_MV_PJ4B */
 
 #if defined(CPU_CORTEXA) || defined(CPU_KRAIT)
 /*
  * Setup hook installed as cortexa_cpufuncs.cf_setup: only enables the
  * cycle counter.
  */
 static void
 cortexa_setup(void)
 {
 
 	cpu_scc_setup_ccnt();
 }
 #endif  /* CPU_CORTEXA || CPU_KRAIT */
Index: head/sys/arm/arm/fusu.S
===================================================================
--- head/sys/arm/arm/fusu.S	(revision 368152)
+++ head/sys/arm/arm/fusu.S	(revision 368153)
@@ -1,313 +1,304 @@
 /*	$NetBSD: fusu.S,v 1.10 2003/12/01 13:34:44 rearnsha Exp $	*/
 
 /*-
  * Copyright (c) 1996-1998 Mark Brinicombe.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Mark Brinicombe
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <machine/asm.h>
 #include <machine/armreg.h>
 #include "assym.inc"
 __FBSDID("$FreeBSD$");
 
 	.syntax	unified
 
 /*
  * GET_PCB(tmp): read CP15 c13/c0/4 into tmp and add TD_PCB.  Every user in
  * this file follows it with "ldr tmp, [tmp]" to fetch the pcb pointer.
  * NOTE(review): presumably that CP15 register holds curthread -- confirm.
  */
 #define GET_PCB(tmp) \
 	mrc p15, 0, tmp, c13, c0, 4; \
 	add	tmp, tmp, #(TD_PCB)
 
 /*
  * casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp,
  *    uint32_t newval);
  *
  * Compare-and-set on a user-space word using ldrex/strex.
  *
  * Entry:
  *	r0	base
  *	r1	oldval
  *	r2	oldvalp
  *	r3	newval
  *
  * On the non-faulting path the word read from *base is stored to *oldvalp.
  *
  * Returns (r0):
  *	-1	base is at or above VM_MAXUSER_ADDRESS-3, or the access faulted
  *	 0	the word matched oldval and the exclusive store succeeded
  *	 1	the word did not match, or the exclusive store failed
  */
 
 ENTRY(casueword)
 EENTRY_NP(casueword32)
 	stmfd	sp!, {r4, r5, r6}
 
 	ldr	r4, =(VM_MAXUSER_ADDRESS-3)
 	cmp	r0, r4
 	mvncs	r0, #0			/* Unsigned >= limit: return -1 */
 	bcs	1f
 
 	GET_PCB(r6)
 	ldr	r6, [r6]		/* r6 = pcb, also used by the fault path */
 
 #ifdef DIAGNOSTIC
 	teq	r6, #0x00000000
 	ldmfdeq	sp!, {r4, r5, r6}
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r4, .Lcasuwordfault
 	str	r4, [r6, #PCB_ONFAULT]	/* Arm the fault handler */
 
 	mov	r5, #1			/* Result stays 1 unless strex runs */
 	ldrex	r4, [r0]
 	cmp	r4, r1
 	strexeq	r5, r3, [r0]		/* r5 = strex status */
 	str	r4, [r2]		/* *oldvalp = value that was read */
 	mov	r0, #0
 	str	r0, [r6, #PCB_ONFAULT]	/* Disarm the fault handler */
 	mov	r0, r5
 1:
 	ldmfd	sp!, {r4, r5, r6}
 	RET
 EEND(casueword32)
 END(casueword)
 
 /*
  * Handle faults from casueword.  Clean up and return -1.
  *
  * Expects r6 to hold the pcb pointer and r4-r6 to still be saved on the
  * stack, as casueword leaves them while its fault handler is armed.
  */
 
 .Lcasuwordfault:
 	mov	r0, #0x00000000
 	str	r0, [r6, #PCB_ONFAULT]
 	mvn	r0, #0
 	ldmfd	sp!, {r4, r5, r6}
 	RET
 
 /*
  * fueword(caddr_t uaddr, long *val);
  * Fetch an int from the user's address space.
  *
  * Entry:
  *	r0	uaddr
  *	r1	val
  *
  * Returns 0 with the fetched word stored to *val, or -1 if uaddr is at or
  * above VM_MAXUSER_ADDRESS-3 or the access faults.
  */
 
 ENTRY(fueword)
 EENTRY_NP(fueword32)
 	ldr	r3, =(VM_MAXUSER_ADDRESS-3)
 	cmp	r0, r3
 	mvncs	r0, #0
 	RETc(cs)
 
 	GET_PCB(r2)
 	ldr	r2, [r2]		/* r2 = pcb, also used by .Lfusufault */
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, .Lfusufault
 	str	r3, [r2, #PCB_ONFAULT]	/* Arm the fault handler */
 
 	ldrt	r3, [r0]
 	str	r3, [r1]
 
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]	/* Disarm; r0 == 0 is also the result */
 	RET
 EEND(fueword32)
 END(fueword)
 
 /*
  * fusword(caddr_t uaddr);
  * Fetch a short from the user's address space.
  *
  * Entry:
  *	r0	uaddr
  *
  * Returns the two bytes at uaddr combined low byte first, or -1 if uaddr
  * is at or above VM_MAXUSER_ADDRESS-1 or the access faults.
  */
 
 ENTRY(fusword)
 	ldr	r3, =(VM_MAXUSER_ADDRESS-1)
 	cmp	r0, r3
 	mvncs	r0, #0
 	RETc(cs)
 
 	GET_PCB(r2)
 	ldr	r2, [r2]		/* r2 = pcb, also used by .Lfusufault */
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r1, .Lfusufault
 	str	r1, [r2, #PCB_ONFAULT]	/* Arm the fault handler */
 
 	/* Two byte loads, so uaddr needs no particular alignment. */
 	ldrbt	r3, [r0], #1
 	ldrbt	ip, [r0]
-#ifdef __ARMEB__
-	orr	r0, ip, r3, asl #8
-#else
 	orr	r0, r3, ip, asl #8
-#endif
 	mov	r1, #0x00000000
 	str	r1, [r2, #PCB_ONFAULT]	/* Disarm the fault handler */
 	RET
 END(fusword)
 
 /*
  * fubyte(caddr_t uaddr);
  * Fetch a byte from the user's address space.
  *
  * Entry:
  *	r0	uaddr
  *
  * Returns the byte zero-extended, or -1 if uaddr is at or above
  * VM_MAXUSER_ADDRESS or the access faults.
  */
 
 ENTRY(fubyte)
 	ldr	r3, =VM_MAXUSER_ADDRESS
 	cmp	r0, r3
 	mvncs	r0, #0
 	RETc(cs)
 
 	GET_PCB(r2)
 	ldr	r2, [r2]		/* r2 = pcb, also used by .Lfusufault */
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r1, .Lfusufault
 	str	r1, [r2, #PCB_ONFAULT]	/* Arm the fault handler */
 
 	ldrbt	r3, [r0]
 
 	mov	r1, #0x00000000
 	str	r1, [r2, #PCB_ONFAULT]	/* Disarm the fault handler */
 	mov	r0, r3
 	RET
 END(fubyte)
 
 /*
  * Handle faults from [fs]u*().  Clean up and return -1.
  *
  * Expects r2 to hold the pcb pointer, as every routine that installs this
  * handler leaves it.
  */
 
 .Lfusufault:
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]
 	mvn	r0, #0x00000000
 	RET
 
 #ifdef DIAGNOSTIC
 /*
  * Handle earlier faults from [fs]u*(), due to no pcb
  *
  * Reached with the user address still in r0; it becomes the second
  * argument to panic() and the format string below becomes the first.
  */
 
 .Lfusupcbfault:
 	mov	r1, r0
 	adr	r0, fusupcbfaulttext
 	b	_C_LABEL(panic)
 
 fusupcbfaulttext:
 	.asciz	"Yikes - no valid PCB during fusuxxx() addr=%08x\n"
 	.align	2
 #endif
 
 /*
  * suword(caddr_t uaddr, int x);
  * Store an int in the user's address space.
  *
  * Entry:
  *	r0	uaddr
  *	r1	x
  *
  * Returns 0 on success, or -1 if uaddr is at or above VM_MAXUSER_ADDRESS-3
  * or the access faults.
  */
 
 ENTRY(suword)
 EENTRY_NP(suword32)
 	ldr	r3, =(VM_MAXUSER_ADDRESS-3)
 	cmp	r0, r3
 	mvncs	r0, #0
 	RETc(cs)
 
 	GET_PCB(r2)
 	ldr	r2, [r2]		/* r2 = pcb, also used by .Lfusufault */
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, .Lfusufault
 	str	r3, [r2, #PCB_ONFAULT]	/* Arm the fault handler */
 
 	strt	r1, [r0]
 
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]	/* Disarm; r0 == 0 is also the result */
 	RET
 EEND(suword32)
 END(suword)
 
 /*
  * susword(caddr_t uaddr, short x);
  * Store a short in the user's address space.
  *
  * Entry:
  *	r0	uaddr
  *	r1	x
  *
  * The value is written as two separate bytes, low byte first.
  * Returns 0 on success, or -1 if uaddr is at or above VM_MAXUSER_ADDRESS-1
  * or the access faults.
  */
 
 ENTRY(susword)
 	ldr	r3, =(VM_MAXUSER_ADDRESS-1)
 	cmp	r0, r3
 	mvncs	r0, #0
 	RETc(cs)
 
 	GET_PCB(r2)
 	ldr	r2, [r2]		/* r2 = pcb, also used by .Lfusufault */
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, .Lfusufault
 	str	r3, [r2, #PCB_ONFAULT]	/* Arm the fault handler */
 
-#ifdef __ARMEB__
-	mov	ip, r1, lsr #8
-	strbt	ip, [r0], #1
-#else
 	strbt	r1, [r0], #1
 	mov	r1, r1, lsr #8
-#endif
 	strbt	r1, [r0]
 
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]	/* Disarm; r0 == 0 is also the result */
 	RET
 END(susword)
 
 /*
  * subyte(caddr_t uaddr, char x);
  * Store a byte in the user's address space.
  *
  * Entry:
  *	r0	uaddr
  *	r1	x
  *
  * Returns 0 on success, or -1 if uaddr is at or above VM_MAXUSER_ADDRESS
  * or the access faults.
  */
 
 ENTRY(subyte)
 	ldr	r3, =VM_MAXUSER_ADDRESS
 	cmp	r0, r3
 	mvncs	r0, #0
 	RETc(cs)
 
 	GET_PCB(r2)
 	ldr	r2, [r2]		/* r2 = pcb, also used by .Lfusufault */
 
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, .Lfusufault
 	str	r3, [r2, #PCB_ONFAULT]	/* Arm the fault handler */
 
 	strbt	r1, [r0]
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]	/* Disarm; r0 == 0 is also the result */
 	RET
 END(subyte)
Index: head/sys/arm/arm/in_cksum_arm.S
===================================================================
--- head/sys/arm/arm/in_cksum_arm.S	(revision 368152)
+++ head/sys/arm/arm/in_cksum_arm.S	(revision 368153)
@@ -1,344 +1,331 @@
 /*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/
 
 /*-
  * Copyright 2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 /*
  * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e
  */
 
 #include "opt_inet.h"
 
 #include <machine/asm.h>
 #include "assym.inc"
 __FBSDID("$FreeBSD$");
 
 	.syntax	unified
 /*
  * int in_cksum(struct mbuf *m, int len)
  *
  * Entry:
  *	r0	m
  *	r1	len
  *
  * Register use inside the loop:
  *	r8	running 32-bit sum
  *	r9	bytes of 'len' still to be summed
  *	r10	byte offset reached so far
  *	ip	next mbuf in the chain
  *
  * The loop runs until the chain ends; once r9 reaches zero the remaining
  * mbufs contribute nothing.  The result is the sum folded to 16 bits and
  * inverted.
  *
  * NOTE: Assumes 'm' is *never* NULL.
  */
 /* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
 ENTRY(in_cksum)
 	stmfd	sp!, {r4-r11,lr}
 	mov	r8, #0x00
 	mov	r9, r1
 	mov	r10, #0x00
 	mov	ip, r0
 
 .Lin_cksum_loop:
 	ldr	r1, [ip, #(M_LEN)]
 	ldr	r0, [ip, #(M_DATA)]
 	ldr	ip, [ip, #(M_NEXT)]
 .Lin_cksum_entry4:
 	cmp	r9, r1
 	movlt	r1, r9			/* r1 = min(r1, r9), signed */
 	sub	r9, r9, r1
 	eor	r11, r10, r0		/* Bit 0: offset parity ^ address parity */
 	add	r10, r10, r1
 	adds	r2, r1, #0x00		/* r2 = r1; Z set if nothing to sum */
 	blne	_ASM_LABEL(L_cksumdata)	/* r2 = partial sum (stays 0 if skipped) */
 	tst	r11, #0x01
 	movne	r2, r2, ror #8		/* Rotate by a byte if parities differ */
 	adds	r8, r8, r2
 	adc	r8, r8, #0x00		/* End-around carry */
 	cmp	ip, #0x00
 	bne	.Lin_cksum_loop
 
 	/* Fold the 32-bit sum into 16 bits and invert it. */
 	mov	r1, #0xff
 	orr	r1, r1, #0xff00		/* r1 = 0xffff */
 	and	r0, r8, r1
 	add	r0, r0, r8, lsr #16
 	add	r0, r0, r0, lsr #16
 	and	r0, r0, r1
 	eor	r0, r0, r1
 	ldmfd	sp!, {r4-r11,pc}
 END(in_cksum)
 
 /*
  * Wrapper around L_cksumdata: takes the buffer pointer in r0 and the
  * length in r1, preserves r4-r7 around the call and returns the
  * accumulated 32-bit sum in r0 instead of r2.
  */
 ENTRY(do_cksum)
 	stmfd	sp!, {r4-r7, lr}
 	bl	L_cksumdata
 	mov	r0, r2
 	ldmfd	sp!, {r4-r7, pc}
 END(do_cksum)
 
 /*
  * The main in*_cksum() workhorse...
  *
  * Entry parameters:
  *	r0	Pointer to buffer
  *	r1	Buffer length
  *	lr	Return address
  *
  * Returns:
  *	r2	Accumulated 32-bit sum
  *
  * Clobbers:
  *	r0-r7
  *
  * The result is left in r2 and r4-r7 are not preserved, so callers save
  * those registers themselves (see in_cksum and do_cksum).
  */
 /* LINTSTUB: Ignore */
 ASENTRY_NP(L_cksumdata)
 #ifdef _ARM_ARCH_5E
 	pld	[r0]			/* Pre-fetch the start of the buffer */
 #endif
 	mov	r2, #0
 
 	/* We first have to word-align the buffer.  */
 	ands	r7, r0, #0x03
 	beq	.Lcksumdata_wordaligned
 	rsb	r7, r7, #0x04		/* r7 = bytes up to the next word */
 	cmp	r1, r7			/* Enough bytes left to make it? */
 	blt	.Lcksumdata_endgame
 	cmp	r7, #0x02
 	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
 	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
 	movlt	r5, #0x00
 	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
 	movle	r6, #0x00
+
 	/*
 	 * Combine the three bytes depending on alignment; eq/ne still
 	 * reflect the "cmp r7, #0x02" above.
 	 */
-#ifdef __ARMEB__
-	orreq	r2, r5, r4, lsl #8
-	orreq	r2, r2, r6, lsl #24
-	orrne	r2, r4, r5, lsl #8
-	orrne	r2, r2, r6, lsl #16
-#else
 	orreq	r2, r4, r5, lsl #8
 	orreq	r2, r2, r6, lsl #16
 	orrne	r2, r5, r4, lsl #8
 	orrne	r2, r2, r6, lsl #24
-#endif
 	subs	r1, r1, r7		/* Update length */
-	RETeq			/* All done? */
+	RETeq				/* All done? */
 
 	/* Buffer is now word aligned */
 .Lcksumdata_wordaligned:
 #ifdef _ARM_ARCH_5E
 	cmp	r1, #0x04		/* Less than 4 bytes left? */
 	blt	.Lcksumdata_endgame	/* Yup */
 
 	/* Now quad-align, if necessary */
 	/*
 	 * r7 ends up holding either 0 (already quad-aligned) or the word
 	 * consumed to reach alignment; it is added into the sum later.
 	 */
 	ands	r7, r0, #0x04
 	ldrne	r7, [r0], #0x04
 	subne	r1, r1, #0x04
 	subs	r1, r1, #0x40
 	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */
 
 	/*
 	 * Buffer is now quad aligned. Sum 64 bytes at a time.
 	 * Note: First ldrd is hoisted above the loop, together with
 	 * setting r6 to zero to avoid stalling for results in the
 	 * loop. (r7 is live, from above).
 	 */
 	ldrd	r4, [r0], #0x08
 	mov	r6, #0x00
 .Lcksumdata_bigloop:
 	pld	[r0, #0x18]
 	adds	r2, r2, r6
 	adcs	r2, r2, r7
 	ldrd	r6, [r0], #0x08
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	ldrd	r4, [r0], #0x08
 	adcs	r2, r2, r6
 	adcs	r2, r2, r7
 	ldrd	r6, [r0], #0x08
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	ldrd	r4, [r0], #0x08
 	adcs	r2, r2, r6
 	adcs	r2, r2, r7
 	pld	[r0, #0x18]
 	ldrd	r6, [r0], #0x08
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	ldrd	r4, [r0], #0x08
 	adcs	r2, r2, r6
 	adcs	r2, r2, r7
 	ldrd	r6, [r0], #0x08
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	adc	r2, r2, #0x00
 	subs	r1, r1, #0x40
 	ldrdge	r4, [r0], #0x08
 	bge	.Lcksumdata_bigloop
 
 	adds	r2, r2, r6		/* r6/r7 still need summing */
 .Lcksumdata_bigloop_end:
 	adcs	r2, r2, r7
 	adc	r2, r2, #0x00
 
 #else	/* !_ARM_ARCH_5E */
 
 	subs	r1, r1, #0x40
 	blt	.Lcksumdata_bigloop_end
 
 	/* Sum 64 bytes per iteration using four 16-byte ldmia bursts. */
 .Lcksumdata_bigloop:
 	ldmia	r0!, {r3, r4, r5, r6}
 	adds	r2, r2, r3
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	ldmia	r0!, {r3, r4, r5, r7}
 	adcs	r2, r2, r6
 	adcs	r2, r2, r3
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	ldmia	r0!, {r3, r4, r5, r6}
 	adcs	r2, r2, r7
 	adcs	r2, r2, r3
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	ldmia	r0!, {r3, r4, r5, r7}
 	adcs	r2, r2, r6
 	adcs	r2, r2, r3
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	adcs	r2, r2, r7
 	adc	r2, r2, #0x00
 	subs	r1, r1, #0x40
 	bge	.Lcksumdata_bigloop
 .Lcksumdata_bigloop_end:
 #endif
 
 	adds	r1, r1, #0x40		/* Undo the extra subtraction */
 	RETeq
 	cmp	r1, #0x20
 
 #ifdef _ARM_ARCH_5E
 	ldrdge	r4, [r0], #0x08		/* Avoid stalling pld and result */
 	blt	.Lcksumdata_less_than_32
 	pld	[r0, #0x18]
 	ldrd	r6, [r0], #0x08
 	adds	r2, r2, r4
 	adcs	r2, r2, r5
 	ldrd	r4, [r0], #0x08
 	adcs	r2, r2, r6
 	adcs	r2, r2, r7
 	ldrd	r6, [r0], #0x08
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
 	adcs	r2, r2, r7
 #else
 	blt	.Lcksumdata_less_than_32
 	ldmia	r0!, {r3, r4, r5, r6}
 	adds	r2, r2, r3
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	ldmia	r0!, {r3, r4, r5, r7}
 	adcs	r2, r2, r6
 	adcs	r2, r2, r3
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 	adcs	r2, r2, r7
 #endif
 	adc	r2, r2, #0x00
 	subs	r1, r1, #0x20
 	RETeq
 
 .Lcksumdata_less_than_32:
 	/* There are less than 32 bytes left */
 	/*
 	 * Computed jump into the three 8-byte groups below.  r3 is the
 	 * number of bytes to sum in whole 8-byte groups, so r4 starts as the
 	 * bytes to skip (0, 8, 16 or 24) and is scaled by 1.5 to a code
 	 * offset, because each group is three instructions (12 bytes).
 	 * When r4 is zero the add is skipped and execution falls through
 	 * the nop into the first group.
 	 */
 	and	r3, r1, #0x18
 	rsb	r4, r3, #0x18
 	sub	r1, r1, r3
 	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
 	addne	pc, pc, r4
 	nop
 
 /*
  * Note: We use ldm here, even on armv5e, since the combined issue/result
  * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
  */
 	/* At least 24 bytes remaining... */
 	ldmia	r0!, {r4, r5}
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 
 	/* At least 16 bytes remaining... */
 	ldmia	r0!, {r4, r5}
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 
 	/* At least 8 bytes remaining... */
 	ldmia	r0!, {r4, r5}
 	adcs	r2, r2, r4
 	adcs	r2, r2, r5
 
 	/* Less than 8 bytes remaining... */
 	adc	r2, r2, #0x00
 	subs	r1, r1, #0x04
 	blt	.Lcksumdata_lessthan4
 
 	ldr	r4, [r0], #0x04
 	sub	r1, r1, #0x04
 	adds	r2, r2, r4
 	adc	r2, r2, #0x00
 
 	/* Deal with < 4 bytes remaining */
 .Lcksumdata_lessthan4:
 	adds	r1, r1, #0x04
 	RETeq
 
 	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
 .Lcksumdata_endgame:
 	ldrb	r3, [r0]		/* Fetch first byte */
 	cmp	r1, #0x02
 	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
 	movlt	r4, #0x00
 	ldrbgt	r5, [r0, #0x02]
 	movle	r5, #0x00
 	/* Combine the three bytes depending on alignment (odd/even address) */
 	tst	r0, #0x01
-#ifdef __ARMEB__
-	orreq	r3, r4, r3, lsl #8
-	orreq	r3, r3, r5, lsl #24
-	orrne	r3, r3, r4, lsl #8
-	orrne	r3, r3, r5, lsl #16
-#else
 	orreq	r3, r3, r4, lsl #8
 	orreq	r3, r3, r5, lsl #16
 	orrne	r3, r4, r3, lsl #8
 	orrne	r3, r3, r5, lsl #24
-#endif
 	adds	r2, r2, r3
 	adc	r2, r2, #0x00
 	RET
 END(L_cksumdata)
 
Index: head/sys/arm/arm/support.S
===================================================================
--- head/sys/arm/arm/support.S	(revision 368152)
+++ head/sys/arm/arm/support.S	(revision 368153)
@@ -1,2965 +1,2420 @@
 /*-
  * Copyright (c) 2004 Olivier Houchard
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*
  * Copyright 2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 /*
  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Neil A. Carson and Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 #include "assym.inc"
 
 	.syntax	unified
 
 /*
  * Literal words holding the addresses of C-side variables, so the routines
  * in this file can reach them with a pc-relative ldr.  bzero below reads
  * .L_arm_bzero and .L_min_bzero_size.
  */
 .L_arm_memcpy:
 	.word	_C_LABEL(_arm_memcpy)
 .L_arm_bzero:
 	.word	_C_LABEL(_arm_bzero)
 .L_min_memcpy_size:
 	.word	_C_LABEL(_min_memcpy_size)
 .L_min_bzero_size:
 	.word	_C_LABEL(_min_bzero_size)
 /*
  * memset: Sets a block of memory to the specified value
  *
  * On entry:
  *   r0 - dest address
  *   r1 - byte to write
  *   r2 - number of bytes to write
  *
  * On exit:
  *   r0 - dest address
  *
  * bzero: Zeroes a block of memory
  *
  * On entry:
  *   r0 - dest address
  *   r1 - number of bytes to clear
  *
  * If the _arm_bzero hook is non-NULL and the length is not below
  * _min_bzero_size, the hook is called with (dest, length, 0); a zero
  * return means the request has been handled.  In every other case control
  * continues at do_memset with a fill value of 0.
  */
 /* LINTSTUB: Func: void bzero(void *, size_t) */
 ENTRY(bzero)
 	ldr	r3, .L_arm_bzero
 	ldr	r3, [r3]		/* r3 = hook function pointer */
 	cmp	r3, #0
 	beq	.Lnormal0		/* No hook installed */
 	ldr	r2, .L_min_bzero_size
 	ldr	r2, [r2]
 	cmp	r1, r2
 	blt	.Lnormal0		/* Too short for the hook (signed compare) */
 	stmfd	sp!, {r0, r1, lr}
 	mov	r2, #0
 	mov	lr, pc			/* Return lands on the cmp below */
 	mov	pc, r3
 	cmp	r0, #0
 	ldmfd	sp!, {r0, r1, lr}	/* Flags from the cmp survive the ldm */
 	RETeq
 .Lnormal0:
 	mov	r3, #0x00
 	b	do_memset
 END(bzero)
 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
 /*
  * From do_memset onwards the register layout is the one bzero arrives
  * with: r0 = dest (left untouched as the return value), r1 = byte count,
  * r3 = fill value, ip = write cursor.
  */
 ENTRY(memset)
 	and	r3, r1, #0xff		/* We deal with bytes */
 	mov	r1, r2
 do_memset:
 	cmp	r1, #0x04		/* Do we have less than 4 bytes */
 	mov	ip, r0
 	blt	.Lmemset_lessthanfour
 
 	/* Ok first we will word align the address */
 	ands	r2, ip, #0x03		/* Get the bottom two bits */
 	bne	.Lmemset_wordunaligned	/* The address is not word aligned */
 
 	/* We are now word aligned */
 .Lmemset_wordaligned:
 	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
 #ifdef _ARM_ARCH_5E
 	tst	ip, #0x04		/* Quad-align for armv5e */
 #else
 	cmp	r1, #0x10
 #endif
 	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
 #ifdef _ARM_ARCH_5E
 	subne	r1, r1, #0x04		/* Quad-align if necessary */
 	strne	r3, [ip], #0x04
 	cmp	r1, #0x10
 #endif
 	blt	.Lmemset_loop4		/* If less than 16 then use words */
 	mov	r2, r3			/* Duplicate data */
 	cmp	r1, #0x80		/* If < 128 then skip the big loop */
 	blt	.Lmemset_loop32
 
 	/* Do 128 bytes at a time */
 .Lmemset_loop128:
 	subs	r1, r1, #0x80
 #ifdef _ARM_ARCH_5E
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 #else
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 #endif
 	bgt	.Lmemset_loop128
 	RETeq			/* Zero length so just exit */
 
 	add	r1, r1, #0x80		/* Adjust for extra sub */
 
 	/* Do 32 bytes at a time */
 .Lmemset_loop32:
 	subs	r1, r1, #0x20
 #ifdef _ARM_ARCH_5E
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 #else
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 #endif
 	bgt	.Lmemset_loop32
 	RETeq			/* Zero length so just exit */
 
 	adds	r1, r1, #0x10		/* Partially adjust for extra sub */
 
 	/* Deal with 16 bytes or more */
 #ifdef _ARM_ARCH_5E
 	strdge	r2, [ip], #0x08
 	strdge	r2, [ip], #0x08
 #else
 	stmiage	ip!, {r2-r3}
 	stmiage	ip!, {r2-r3}
 #endif
 	RETeq			/* Zero length so just exit */
 
 	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */
 
 	/* We have at least 4 bytes so copy as words */
 .Lmemset_loop4:
 	subs	r1, r1, #0x04
 	strge	r3, [ip], #0x04
 	bgt	.Lmemset_loop4
 	RETeq			/* Zero length so just exit */
 
 	/*
 	 * r1 is now negative: bytes remaining minus 4.  The armv5e variant
 	 * adds the 4 back and compares against 2; the other variant
 	 * compares the biased count against -2 directly.
 	 */
 #ifdef _ARM_ARCH_5E
 	/* Compensate for 64-bit alignment check */
 	adds	r1, r1, #0x04
 	RETeq
 	cmp	r1, #2
 #else
 	cmp	r1, #-2
 #endif
 
 	strb	r3, [ip], #0x01		/* Set 1 byte */
 	strbge	r3, [ip], #0x01		/* Set another byte */
 	strbgt	r3, [ip]		/* and a third */
 	RET			/* Exit */
 
 .Lmemset_wordunaligned:
 	rsb	r2, r2, #0x004		/* r2 = bytes up to the next word */
 	strb	r3, [ip], #0x01		/* Set 1 byte */
 	cmp	r2, #0x02
 	strbge	r3, [ip], #0x01		/* Set another byte */
 	sub	r1, r1, r2
 	strbgt	r3, [ip], #0x01		/* and a third */
 	cmp	r1, #0x04		/* More than 4 bytes left? */
 	bge	.Lmemset_wordaligned	/* Yup */
 
 .Lmemset_lessthanfour:
 	cmp	r1, #0x00
 	RETeq			/* Zero length so exit */
 	strb	r3, [ip], #0x01		/* Set 1 byte */
 	cmp	r1, #0x02
 	strbge	r3, [ip], #0x01		/* Set another byte */
 	strbgt	r3, [ip]		/* and a third */
 	RET			/* Exit */
 /*
  * NOTE(review): memset is opened with ENTRY() but closed with EEND(), and
  * bzero, already closed with END() before memset, is closed a second time
  * here -- confirm this macro pairing is intended.
  */
 EEND(memset)
 END(bzero)
 
 ENTRY(bcmp)
 	mov	ip, r0
 	cmp	r2, #0x06
 	beq	.Lmemcmp_6bytes
 	mov	r0, #0x00
 
 	/* Are both addresses aligned the same way? */
 	cmp	r2, #0x00
 	eorsne	r3, ip, r1
 	RETeq			/* len == 0, or same addresses! */
 	tst	r3, #0x03
 	subne	r2, r2, #0x01
 	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */
 
 	/* Word-align the addresses, if necessary */
 	sub	r3, r1, #0x05
 	ands	r3, r3, #0x03
 	add	r3, r3, r3, lsl #1
 	addne	pc, pc, r3, lsl #3
 	nop
 
 	/* Compare up to 3 bytes */
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r0, r0, r3
 	RETne
 	subs	r2, r2, #0x01
 	RETeq
 
 	/* Compare up to 2 bytes */
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r0, r0, r3
 	RETne
 	subs	r2, r2, #0x01
 	RETeq
 
 	/* Compare 1 byte */
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r0, r0, r3
 	RETne
 	subs	r2, r2, #0x01
 	RETeq
 
 	/* Compare 4 bytes at a time, if possible */
 	subs	r2, r2, #0x04
 	bcc	.Lmemcmp_bytewise
 .Lmemcmp_word_aligned:
 	ldr	r0, [ip], #0x04
 	ldr	r3, [r1], #0x04
 	subs	r2, r2, #0x04
 	cmpcs	r0, r3
 	beq	.Lmemcmp_word_aligned
 	sub	r0, r0, r3
 
 	/* Correct for extra subtraction, and check if done */
 	adds	r2, r2, #0x04
 	cmpeq	r0, #0x00		/* If done, did all bytes match? */
 	RETeq			/* Yup. Just return */
 
 	/* Re-do the final word byte-wise */
 	sub	ip, ip, #0x04
 	sub	r1, r1, #0x04
 
 .Lmemcmp_bytewise:
 	add	r2, r2, #0x03
 .Lmemcmp_bytewise2:
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r2, r2, #0x01
 	cmpcs	r0, r3
 	beq	.Lmemcmp_bytewise2
 	sub	r0, r0, r3
 	RET
 
 	/*
 	 * 6 byte compares are very common, thanks to the network stack.
 	 * This code is hand-scheduled to reduce the number of stalls for
 	 * load results. Everything else being equal, this will be ~32%
 	 * faster than a byte-wise memcmp.
 	 */
 	.align	5
 .Lmemcmp_6bytes:
 	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
 	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
 	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
 	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
 	ldrbeq	r3, [ip, #0x01]		/* r3 = b1#1 */
 	RETne			/* Return if mismatch on #0 */
 	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
 	ldrbeq	r3, [r1, #0x02]		/* r3 = b2#2 */
 	ldrbeq	r0, [ip, #0x02]		/* r0 = b1#2 */
 	RETne			/* Return if mismatch on #1 */
 	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
 	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
 	ldrbeq	r3, [ip, #0x03]		/* r3 = b1#3 */
 	RETne			/* Return if mismatch on #2 */
 	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
 	ldrbeq	r3, [r1, #0x04]		/* r3 = b2#4 */
 	ldrbeq	r0, [ip, #0x04]		/* r0 = b1#4 */
 	RETne			/* Return if mismatch on #3 */
 	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
 	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
 	ldrbeq	r3, [ip, #0x05]		/* r3 = b1#5 */
 	RETne			/* Return if mismatch on #4 */
 	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
 	RET
 END(bcmp)
 
 ENTRY(bcopy)
 	/* switch the source and destination registers */
 	eor     r0, r1, r0
 	eor     r1, r0, r1
 	eor     r0, r1, r0
 EENTRY(memmove)
 	/* Do the buffers overlap? */
 	cmp	r0, r1
 	RETeq		/* Bail now if src/dst are the same */
 	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
 	subcs	r3, r1, r0	/* if (src > dsr) r3 = src - dst */
 	cmp	r3, r2		/* if (r3 < len) we have an overlap */
 	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)
 
 	/* Determine copy direction */
 	cmp	r1, r0
 	bcc	.Lmemmove_backwards
 
 	moveq	r0, #0			/* Quick abort for len=0 */
 	RETeq
 
 	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
 	subs	r2, r2, #4
 	blt	.Lmemmove_fl4		/* less than 4 bytes */
 	ands	r12, r0, #3
 	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
 	ands	r12, r1, #3
 	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */
 
 .Lmemmove_ft8:
 	/* We have aligned source and destination */
 	subs	r2, r2, #8
 	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
 	subs	r2, r2, #0x14
 	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
 	stmdb	sp!, {r4}		/* borrow r4 */
 
 	/* blat 32 bytes at a time */
 	/* XXX for really big copies perhaps we should use more registers */
 .Lmemmove_floop32:
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	subs	r2, r2, #0x20
 	bge	.Lmemmove_floop32
 
 	cmn	r2, #0x10
 	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
 	stmiage	r0!, {r3, r4, r12, lr}
 	subge	r2, r2, #0x10
 	ldmia	sp!, {r4}		/* return r4 */
 
 .Lmemmove_fl32:
 	adds	r2, r2, #0x14
 
 	/* blat 12 bytes at a time */
 .Lmemmove_floop12:
 	ldmiage	r1!, {r3, r12, lr}
 	stmiage	r0!, {r3, r12, lr}
 	subsge	r2, r2, #0x0c
 	bge	.Lmemmove_floop12
 
 .Lmemmove_fl12:
 	adds	r2, r2, #8
 	blt	.Lmemmove_fl4
 
 	subs	r2, r2, #4
 	ldrlt	r3, [r1], #4
 	strlt	r3, [r0], #4
 	ldmiage	r1!, {r3, r12}
 	stmiage	r0!, {r3, r12}
 	subge	r2, r2, #4
 
 .Lmemmove_fl4:
 	/* less than 4 bytes to go */
 	adds	r2, r2, #4
 	ldmiaeq	sp!, {r0, pc}		/* done */
 
 	/* copy the crud byte at a time */
 	cmp	r2, #2
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrbge	r3, [r1], #1
 	strbge	r3, [r0], #1
 	ldrbgt	r3, [r1], #1
 	strbgt	r3, [r0], #1
 	ldmia	sp!, {r0, pc}
 
 	/* erg - unaligned destination */
 .Lmemmove_fdestul:
 	rsb	r12, r12, #4
 	cmp	r12, #2
 
 	/* align destination with byte copies */
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrbge	r3, [r1], #1
 	strbge	r3, [r0], #1
 	ldrbgt	r3, [r1], #1
 	strbgt	r3, [r0], #1
 	subs	r2, r2, r12
 	blt	.Lmemmove_fl4		/* less the 4 bytes */
 
 	ands	r12, r1, #3
 	beq	.Lmemmove_ft8		/* we have an aligned source */
 
 	/* erg - unaligned source */
 	/* This is where it gets nasty ... */
 .Lmemmove_fsrcul:
 	bic	r1, r1, #3
 	ldr	lr, [r1], #4
 	cmp	r12, #2
 	bgt	.Lmemmove_fsrcul3
 	beq	.Lmemmove_fsrcul2
 	cmp	r2, #0x0c
 	blt	.Lmemmove_fsrcul1loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemmove_fsrcul1loop16:
-#ifdef __ARMEB__
-	mov	r3, lr, lsl #8
-#else
 	mov	r3, lr, lsr #8
-#endif
 	ldmia	r1!, {r4, r5, r12, lr}
-#ifdef __ARMEB__
-	orr	r3, r3, r4, lsr #24
-	mov	r4, r4, lsl #8
-	orr	r4, r4, r5, lsr #24
-	mov	r5, r5, lsl #8
-	orr	r5, r5, r12, lsr #24
-	mov	r12, r12, lsl #8
-	orr	r12, r12, lr, lsr #24
-#else
 	orr	r3, r3, r4, lsl #24
 	mov	r4, r4, lsr #8
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r12, lsl #24
 	mov	r12, r12, lsr #8
 	orr	r12, r12, lr, lsl #24
-#endif
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_fsrcul1loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_fsrcul1l4
 
 .Lmemmove_fsrcul1loop4:
-#ifdef __ARMEB__
-	mov	r12, lr, lsl #8
-#else
 	mov	r12, lr, lsr #8
-#endif
 	ldr	lr, [r1], #4
-#ifdef __ARMEB__
-	orr	r12, r12, lr, lsr #24
-#else
 	orr	r12, r12, lr, lsl #24
-#endif
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemmove_fsrcul1loop4
 
 .Lmemmove_fsrcul1l4:
 	sub	r1, r1, #3
 	b	.Lmemmove_fl4
 
 .Lmemmove_fsrcul2:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_fsrcul2loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemmove_fsrcul2loop16:
-#ifdef __ARMEB__
-	mov	r3, lr, lsl #16
-#else
 	mov	r3, lr, lsr #16
-#endif
 	ldmia	r1!, {r4, r5, r12, lr}
-#ifdef __ARMEB__
-	orr	r3, r3, r4, lsr #16
-	mov	r4, r4, lsl #16
-	orr	r4, r4, r5, lsr #16
-	mov	r5, r5, lsl #16
-	orr	r5, r5, r12, lsr #16
-	mov	r12, r12, lsl #16
-	orr	r12, r12, lr, lsr #16
-#else
 	orr	r3, r3, r4, lsl #16
 	mov	r4, r4, lsr #16
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r12, lsl #16
 	mov	r12, r12, lsr #16
 	orr	r12, r12, lr, lsl #16
-#endif
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_fsrcul2loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_fsrcul2l4
 
 .Lmemmove_fsrcul2loop4:
-#ifdef __ARMEB__
-	mov	r12, lr, lsl #16
-#else
 	mov	r12, lr, lsr #16
-#endif
 	ldr	lr, [r1], #4
-#ifdef __ARMEB__
-	orr	r12, r12, lr, lsr #16
-#else
 	orr	r12, r12, lr, lsl #16
-#endif
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemmove_fsrcul2loop4
 
 .Lmemmove_fsrcul2l4:
 	sub	r1, r1, #2
 	b	.Lmemmove_fl4
 
 .Lmemmove_fsrcul3:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_fsrcul3loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemmove_fsrcul3loop16:
-#ifdef __ARMEB__
-	mov	r3, lr, lsl #24
-#else
 	mov	r3, lr, lsr #24
-#endif
 	ldmia	r1!, {r4, r5, r12, lr}
-#ifdef __ARMEB__
-	orr	r3, r3, r4, lsr #8
-	mov	r4, r4, lsl #24
-	orr	r4, r4, r5, lsr #8
-	mov	r5, r5, lsl #24
-	orr	r5, r5, r12, lsr #8
-	mov	r12, r12, lsl #24
-	orr	r12, r12, lr, lsr #8
-#else
 	orr	r3, r3, r4, lsl #8
 	mov	r4, r4, lsr #24
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r12, lsl #8
 	mov	r12, r12, lsr #24
 	orr	r12, r12, lr, lsl #8
-#endif
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_fsrcul3loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_fsrcul3l4
 
 .Lmemmove_fsrcul3loop4:
-#ifdef __ARMEB__
-	mov	r12, lr, lsl #24
-#else
 	mov	r12, lr, lsr #24
-#endif
 	ldr	lr, [r1], #4
-#ifdef __ARMEB__
-	orr	r12, r12, lr, lsr #8
-#else
 	orr	r12, r12, lr, lsl #8
-#endif
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemmove_fsrcul3loop4
 
 .Lmemmove_fsrcul3l4:
 	sub	r1, r1, #1
 	b	.Lmemmove_fl4
 
 .Lmemmove_backwards:
 	add	r1, r1, r2
 	add	r0, r0, r2
 	subs	r2, r2, #4
 	blt	.Lmemmove_bl4		/* less than 4 bytes */
 	ands	r12, r0, #3
 	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
 	ands	r12, r1, #3
 	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */
 
 .Lmemmove_bt8:
 	/* We have aligned source and destination */
 	subs	r2, r2, #8
 	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
 	stmdb	sp!, {r4, lr}
 	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
 	blt	.Lmemmove_bl32
 
 	/* blat 32 bytes at a time */
 	/* XXX for really big copies perhaps we should use more registers */
 .Lmemmove_bloop32:
 	ldmdb	r1!, {r3, r4, r12, lr}
 	stmdb	r0!, {r3, r4, r12, lr}
 	ldmdb	r1!, {r3, r4, r12, lr}
 	stmdb	r0!, {r3, r4, r12, lr}
 	subs	r2, r2, #0x20
 	bge	.Lmemmove_bloop32
 
 .Lmemmove_bl32:
 	cmn	r2, #0x10
 	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
 	stmdbge	r0!, {r3, r4, r12, lr}
 	subge	r2, r2, #0x10
 	adds	r2, r2, #0x14
 	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
 	stmdbge	r0!, {r3, r12, lr}
 	subge	r2, r2, #0x0c
 	ldmia	sp!, {r4, lr}
 
 .Lmemmove_bl12:
 	adds	r2, r2, #8
 	blt	.Lmemmove_bl4
 	subs	r2, r2, #4
 	ldrlt	r3, [r1, #-4]!
 	strlt	r3, [r0, #-4]!
 	ldmdbge	r1!, {r3, r12}
 	stmdbge	r0!, {r3, r12}
 	subge	r2, r2, #4
 
 .Lmemmove_bl4:
 	/* less than 4 bytes to go */
 	adds	r2, r2, #4
 	RETeq			/* done */
 
 	/* copy the crud byte at a time */
 	cmp	r2, #2
 	ldrb	r3, [r1, #-1]!
 	strb	r3, [r0, #-1]!
 	ldrbge	r3, [r1, #-1]!
 	strbge	r3, [r0, #-1]!
 	ldrbgt	r3, [r1, #-1]!
 	strbgt	r3, [r0, #-1]!
 	RET
 
 	/* erg - unaligned destination */
 .Lmemmove_bdestul:
 	cmp	r12, #2
 
 	/* align destination with byte copies */
 	ldrb	r3, [r1, #-1]!
 	strb	r3, [r0, #-1]!
 	ldrbge	r3, [r1, #-1]!
 	strbge	r3, [r0, #-1]!
 	ldrbgt	r3, [r1, #-1]!
 	strbgt	r3, [r0, #-1]!
 	subs	r2, r2, r12
 	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
 	ands	r12, r1, #3
 	beq	.Lmemmove_bt8		/* we have an aligned source */
 
 	/* erg - unaligned source */
 	/* This is where it gets nasty ... */
 .Lmemmove_bsrcul:
 	bic	r1, r1, #3
 	ldr	r3, [r1, #0]
 	cmp	r12, #2
 	blt	.Lmemmove_bsrcul1
 	beq	.Lmemmove_bsrcul2
 	cmp	r2, #0x0c
 	blt	.Lmemmove_bsrcul3loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5, lr}
 
 .Lmemmove_bsrcul3loop16:
-#ifdef __ARMEB__
-	mov	lr, r3, lsr #8
-#else
 	mov	lr, r3, lsl #8
-#endif
 	ldmdb	r1!, {r3-r5, r12}
-#ifdef __ARMEB__
-	orr	lr, lr, r12, lsl #24
-	mov	r12, r12, lsr #8
-	orr	r12, r12, r5, lsl #24
-	mov	r5, r5, lsr #8
-	orr	r5, r5, r4, lsl #24
-	mov	r4, r4, lsr #8
-	orr	r4, r4, r3, lsl #24
-#else
 	orr	lr, lr, r12, lsr #24
 	mov	r12, r12, lsl #8
 	orr	r12, r12, r5, lsr #24
 	mov	r5, r5, lsl #8
 	orr	r5, r5, r4, lsr #24
 	mov	r4, r4, lsl #8
 	orr	r4, r4, r3, lsr #24
-#endif
 	stmdb	r0!, {r4, r5, r12, lr}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_bsrcul3loop16
 	ldmia	sp!, {r4, r5, lr}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_bsrcul3l4
 
 .Lmemmove_bsrcul3loop4:
-#ifdef __ARMEB__
-	mov	r12, r3, lsr #8
-#else
 	mov	r12, r3, lsl #8
-#endif
 	ldr	r3, [r1, #-4]!
-#ifdef __ARMEB__
-	orr	r12, r12, r3, lsl #24
-#else
 	orr	r12, r12, r3, lsr #24
-#endif
 	str	r12, [r0, #-4]!
 	subs	r2, r2, #4
 	bge	.Lmemmove_bsrcul3loop4
 
 .Lmemmove_bsrcul3l4:
 	add	r1, r1, #3
 	b	.Lmemmove_bl4
 
 .Lmemmove_bsrcul2:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_bsrcul2loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5, lr}
 
 .Lmemmove_bsrcul2loop16:
-#ifdef __ARMEB__
-	mov	lr, r3, lsr #16
-#else
 	mov	lr, r3, lsl #16
-#endif
 	ldmdb	r1!, {r3-r5, r12}
-#ifdef __ARMEB__
-	orr	lr, lr, r12, lsl #16
-	mov	r12, r12, lsr #16
-	orr	r12, r12, r5, lsl #16
-	mov	r5, r5, lsr #16
-	orr	r5, r5, r4, lsl #16
-	mov	r4, r4, lsr #16
-	orr	r4, r4, r3, lsl #16
-#else
 	orr	lr, lr, r12, lsr #16
 	mov	r12, r12, lsl #16
 	orr	r12, r12, r5, lsr #16
 	mov	r5, r5, lsl #16
 	orr	r5, r5, r4, lsr #16
 	mov	r4, r4, lsl #16
 	orr	r4, r4, r3, lsr #16
-#endif
 	stmdb	r0!, {r4, r5, r12, lr}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_bsrcul2loop16
 	ldmia	sp!, {r4, r5, lr}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_bsrcul2l4
 
 .Lmemmove_bsrcul2loop4:
-#ifdef __ARMEB__
-	mov	r12, r3, lsr #16
-#else
 	mov	r12, r3, lsl #16
-#endif
 	ldr	r3, [r1, #-4]!
-#ifdef __ARMEB__
-	orr	r12, r12, r3, lsl #16
-#else
 	orr	r12, r12, r3, lsr #16
-#endif
 	str	r12, [r0, #-4]!
 	subs	r2, r2, #4
 	bge	.Lmemmove_bsrcul2loop4
 
 .Lmemmove_bsrcul2l4:
 	add	r1, r1, #2
 	b	.Lmemmove_bl4
 
 .Lmemmove_bsrcul1:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_bsrcul1loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5, lr}
 
 .Lmemmove_bsrcul1loop32:
-#ifdef __ARMEB__
-	mov	lr, r3, lsr #24
-#else
 	mov	lr, r3, lsl #24
-#endif
 	ldmdb	r1!, {r3-r5, r12}
-#ifdef __ARMEB__
-	orr	lr, lr, r12, lsl #8
-	mov	r12, r12, lsr #24
-	orr	r12, r12, r5, lsl #8
-	mov	r5, r5, lsr #24
-	orr	r5, r5, r4, lsl #8
-	mov	r4, r4, lsr #24
-	orr	r4, r4, r3, lsl #8
-#else
 	orr	lr, lr, r12, lsr #8
 	mov	r12, r12, lsl #24
 	orr	r12, r12, r5, lsr #8
 	mov	r5, r5, lsl #24
 	orr	r5, r5, r4, lsr #8
 	mov	r4, r4, lsl #24
 	orr	r4, r4, r3, lsr #8
-#endif
 	stmdb	r0!, {r4, r5, r12, lr}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_bsrcul1loop32
 	ldmia	sp!, {r4, r5, lr}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_bsrcul1l4
 
 .Lmemmove_bsrcul1loop4:
-#ifdef __ARMEB__
-	mov	r12, r3, lsr #24
-#else
 	mov	r12, r3, lsl #24
-#endif
 	ldr	r3, [r1, #-4]!
-#ifdef __ARMEB__
-	orr	r12, r12, r3, lsl #8
-#else
 	orr	r12, r12, r3, lsr #8
-#endif
 	str	r12, [r0, #-4]!
 	subs	r2, r2, #4
 	bge	.Lmemmove_bsrcul1loop4
 
 .Lmemmove_bsrcul1l4:
 	add	r1, r1, #1
 	b	.Lmemmove_bl4
 EEND(memmove)
 END(bcopy)
 
 #if !defined(_ARM_ARCH_5E)
 ENTRY(memcpy)
 	/* save leaf functions having to store this away */
 	/* Do not check arm_memcpy if we're running from flash */
 #if defined(FLASHADDR) && defined(PHYSADDR)
 #if FLASHADDR > PHYSADDR
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bls	.Lnormal
 #else
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bhi	.Lnormal
 #endif
 #endif
 	ldr	r3, .L_arm_memcpy
 	ldr	r3, [r3]
 	cmp	r3, #0
 	beq	.Lnormal
 	ldr	r3, .L_min_memcpy_size
 	ldr	r3, [r3]
 	cmp	r2, r3
 	blt	.Lnormal
 	stmfd	sp!, {r0-r2, r4, lr}
 	mov	r3, #0
 	ldr	r4, .L_arm_memcpy
 	mov	lr, pc
 	ldr	pc, [r4]
 	cmp	r0, #0
 	ldmfd	sp!, {r0-r2, r4, lr}
 	RETeq
 
 .Lnormal:
 	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
 
 	subs	r2, r2, #4
 	blt	.Lmemcpy_l4		/* less than 4 bytes */
 	ands	r12, r0, #3
 	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
 	ands	r12, r1, #3
 	bne	.Lmemcpy_srcul		/* oh unaligned source addr */
 
 .Lmemcpy_t8:
 	/* We have aligned source and destination */
 	subs	r2, r2, #8
 	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
 	subs	r2, r2, #0x14
 	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
 	stmdb	sp!, {r4}		/* borrow r4 */
 
 	/* blat 32 bytes at a time */
 	/* XXX for really big copies perhaps we should use more registers */
 .Lmemcpy_loop32:
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	subs	r2, r2, #0x20
 	bge	.Lmemcpy_loop32
 
 	cmn	r2, #0x10
 	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
 	stmiage	r0!, {r3, r4, r12, lr}
 	subge	r2, r2, #0x10
 	ldmia	sp!, {r4}		/* return r4 */
 
 .Lmemcpy_l32:
 	adds	r2, r2, #0x14
 
 	/* blat 12 bytes at a time */
 .Lmemcpy_loop12:
 	ldmiage	r1!, {r3, r12, lr}
 	stmiage	r0!, {r3, r12, lr}
 	subsge	r2, r2, #0x0c
 	bge	.Lmemcpy_loop12
 
 .Lmemcpy_l12:
 	adds	r2, r2, #8
 	blt	.Lmemcpy_l4
 
 	subs	r2, r2, #4
 	ldrlt	r3, [r1], #4
 	strlt	r3, [r0], #4
 	ldmiage	r1!, {r3, r12}
 	stmiage	r0!, {r3, r12}
 	subge	r2, r2, #4
 
 .Lmemcpy_l4:
 	/* less than 4 bytes to go */
 	adds	r2, r2, #4
 #ifdef __APCS_26_
 	ldmiaeq sp!, {r0, pc}^		/* done */
 #else
 	ldmiaeq	sp!, {r0, pc}		/* done */
 #endif
 	/* copy the crud byte at a time */
 	cmp	r2, #2
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrbge	r3, [r1], #1
 	strbge	r3, [r0], #1
 	ldrbgt	r3, [r1], #1
 	strbgt	r3, [r0], #1
 	ldmia	sp!, {r0, pc}
 
 	/* erg - unaligned destination */
 .Lmemcpy_destul:
 	rsb	r12, r12, #4
 	cmp	r12, #2
 
 	/* align destination with byte copies */
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrbge	r3, [r1], #1
 	strbge	r3, [r0], #1
 	ldrbgt	r3, [r1], #1
 	strbgt	r3, [r0], #1
 	subs	r2, r2, r12
 	blt	.Lmemcpy_l4		/* less the 4 bytes */
 
 	ands	r12, r1, #3
 	beq	.Lmemcpy_t8		/* we have an aligned source */
 
 	/* erg - unaligned source */
 	/* This is where it gets nasty ... */
 .Lmemcpy_srcul:
 	bic	r1, r1, #3
 	ldr	lr, [r1], #4
 	cmp	r12, #2
 	bgt	.Lmemcpy_srcul3
 	beq	.Lmemcpy_srcul2
 	cmp	r2, #0x0c
 	blt	.Lmemcpy_srcul1loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemcpy_srcul1loop16:
 	mov	r3, lr, lsr #8
 	ldmia	r1!, {r4, r5, r12, lr}
 	orr	r3, r3, r4, lsl #24
 	mov	r4, r4, lsr #8
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r12, lsl #24
 	mov	r12, r12, lsr #8
 	orr	r12, r12, lr, lsl #24
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_srcul1loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemcpy_srcul1l4
 
 .Lmemcpy_srcul1loop4:
 	mov	r12, lr, lsr #8
 	ldr	lr, [r1], #4
 	orr	r12, r12, lr, lsl #24
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemcpy_srcul1loop4
 
 .Lmemcpy_srcul1l4:
 	sub	r1, r1, #3
 	b	.Lmemcpy_l4
 
 .Lmemcpy_srcul2:
 	cmp	r2, #0x0c
 	blt	.Lmemcpy_srcul2loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemcpy_srcul2loop16:
 	mov	r3, lr, lsr #16
 	ldmia	r1!, {r4, r5, r12, lr}
 	orr	r3, r3, r4, lsl #16
 	mov	r4, r4, lsr #16
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r12, lsl #16
 	mov	r12, r12, lsr #16
 	orr	r12, r12, lr, lsl #16
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_srcul2loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemcpy_srcul2l4
 
 .Lmemcpy_srcul2loop4:
 	mov	r12, lr, lsr #16
 	ldr	lr, [r1], #4
 	orr	r12, r12, lr, lsl #16
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemcpy_srcul2loop4
 
 .Lmemcpy_srcul2l4:
 	sub	r1, r1, #2
 	b	.Lmemcpy_l4
 
 .Lmemcpy_srcul3:
 	cmp	r2, #0x0c
 	blt	.Lmemcpy_srcul3loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemcpy_srcul3loop16:
 	mov	r3, lr, lsr #24
 	ldmia	r1!, {r4, r5, r12, lr}
 	orr	r3, r3, r4, lsl #8
 	mov	r4, r4, lsr #24
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r12, lsl #8
 	mov	r12, r12, lsr #24
 	orr	r12, r12, lr, lsl #8
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_srcul3loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemcpy_srcul3l4
 
 .Lmemcpy_srcul3loop4:
 	mov	r12, lr, lsr #24
 	ldr	lr, [r1], #4
 	orr	r12, r12, lr, lsl #8
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemcpy_srcul3loop4
 
 .Lmemcpy_srcul3l4:
 	sub	r1, r1, #1
 	b	.Lmemcpy_l4
 END(memcpy)
 
 #else
 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
 ENTRY(memcpy)
 	pld	[r1]
 	cmp	r2, #0x0c
 	ble	.Lmemcpy_short		/* <= 12 bytes */
 #ifdef FLASHADDR
 #if FLASHADDR > PHYSADDR
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bls	.Lnormal
 #else
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bhi	.Lnormal
 #endif
 #endif
 	ldr	r3, .L_arm_memcpy
 	ldr	r3, [r3]
 	cmp	r3, #0
 	beq	.Lnormal
 	ldr	r3, .L_min_memcpy_size
 	ldr	r3, [r3]
 	cmp	r2, r3
 	blt	.Lnormal
 	stmfd	sp!, {r0-r2, r4, lr}
 	mov	r3, #0
 	ldr	r4, .L_arm_memcpy
 	mov	lr, pc
 	ldr	pc, [r4]
 	cmp	r0, #0
 	ldmfd	sp!, {r0-r2, r4, lr}
 	RETeq
 .Lnormal:
 	mov	r3, r0			/* We must not clobber r0 */
 
 	/* Word-align the destination buffer */
 	ands	ip, r3, #0x03		/* Already word aligned? */
 	beq	.Lmemcpy_wordaligned	/* Yup */
 	cmp	ip, #0x02
 	ldrb	ip, [r1], #0x01
 	sub	r2, r2, #0x01
 	strb	ip, [r3], #0x01
 	ldrble	ip, [r1], #0x01
 	suble	r2, r2, #0x01
 	strble	ip, [r3], #0x01
 	ldrblt	ip, [r1], #0x01
 	sublt	r2, r2, #0x01
 	strblt	ip, [r3], #0x01
 
 	/* Destination buffer is now word aligned */
 .Lmemcpy_wordaligned:
 	ands	ip, r1, #0x03		/* Is src also word-aligned? */
 	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
 
 	/* Quad-align the destination buffer */
 	tst	r3, #0x07		/* Already quad aligned? */
 	ldrne	ip, [r1], #0x04
 	stmfd	sp!, {r4-r9}		/* Free up some registers */
 	subne	r2, r2, #0x04
 	strne	ip, [r3], #0x04
 
 	/* Destination buffer quad aligned, source is at least word aligned */
 	subs	r2, r2, #0x80
 	blt	.Lmemcpy_w_lessthan128
 
 	/* Copy 128 bytes at a time */
 .Lmemcpy_w_loop128:
 	ldr	r4, [r1], #0x04		/* LD:00-03 */
 	ldr	r5, [r1], #0x04		/* LD:04-07 */
 	pld	[r1, #0x18]		/* Prefetch 0x20 */
 	ldr	r6, [r1], #0x04		/* LD:08-0b */
 	ldr	r7, [r1], #0x04		/* LD:0c-0f */
 	ldr	r8, [r1], #0x04		/* LD:10-13 */
 	ldr	r9, [r1], #0x04		/* LD:14-17 */
 	strd	r4, [r3], #0x08		/* ST:00-07 */
 	ldr	r4, [r1], #0x04		/* LD:18-1b */
 	ldr	r5, [r1], #0x04		/* LD:1c-1f */
 	strd	r6, [r3], #0x08		/* ST:08-0f */
 	ldr	r6, [r1], #0x04		/* LD:20-23 */
 	ldr	r7, [r1], #0x04		/* LD:24-27 */
 	pld	[r1, #0x18]		/* Prefetch 0x40 */
 	strd	r8, [r3], #0x08		/* ST:10-17 */
 	ldr	r8, [r1], #0x04		/* LD:28-2b */
 	ldr	r9, [r1], #0x04		/* LD:2c-2f */
 	strd	r4, [r3], #0x08		/* ST:18-1f */
 	ldr	r4, [r1], #0x04		/* LD:30-33 */
 	ldr	r5, [r1], #0x04		/* LD:34-37 */
 	strd	r6, [r3], #0x08		/* ST:20-27 */
 	ldr	r6, [r1], #0x04		/* LD:38-3b */
 	ldr	r7, [r1], #0x04		/* LD:3c-3f */
 	strd	r8, [r3], #0x08		/* ST:28-2f */
 	ldr	r8, [r1], #0x04		/* LD:40-43 */
 	ldr	r9, [r1], #0x04		/* LD:44-47 */
 	pld	[r1, #0x18]		/* Prefetch 0x60 */
 	strd	r4, [r3], #0x08		/* ST:30-37 */
 	ldr	r4, [r1], #0x04		/* LD:48-4b */
 	ldr	r5, [r1], #0x04		/* LD:4c-4f */
 	strd	r6, [r3], #0x08		/* ST:38-3f */
 	ldr	r6, [r1], #0x04		/* LD:50-53 */
 	ldr	r7, [r1], #0x04		/* LD:54-57 */
 	strd	r8, [r3], #0x08		/* ST:40-47 */
 	ldr	r8, [r1], #0x04		/* LD:58-5b */
 	ldr	r9, [r1], #0x04		/* LD:5c-5f */
 	strd	r4, [r3], #0x08		/* ST:48-4f */
 	ldr	r4, [r1], #0x04		/* LD:60-63 */
 	ldr	r5, [r1], #0x04		/* LD:64-67 */
 	pld	[r1, #0x18]		/* Prefetch 0x80 */
 	strd	r6, [r3], #0x08		/* ST:50-57 */
 	ldr	r6, [r1], #0x04		/* LD:68-6b */
 	ldr	r7, [r1], #0x04		/* LD:6c-6f */
 	strd	r8, [r3], #0x08		/* ST:58-5f */
 	ldr	r8, [r1], #0x04		/* LD:70-73 */
 	ldr	r9, [r1], #0x04		/* LD:74-77 */
 	strd	r4, [r3], #0x08		/* ST:60-67 */
 	ldr	r4, [r1], #0x04		/* LD:78-7b */
 	ldr	r5, [r1], #0x04		/* LD:7c-7f */
 	strd	r6, [r3], #0x08		/* ST:68-6f */
 	strd	r8, [r3], #0x08		/* ST:70-77 */
 	subs	r2, r2, #0x80
 	strd	r4, [r3], #0x08		/* ST:78-7f */
 	bge	.Lmemcpy_w_loop128
 
 .Lmemcpy_w_lessthan128:
 	adds	r2, r2, #0x80		/* Adjust for extra sub */
 	ldmfdeq	sp!, {r4-r9}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x20
 	blt	.Lmemcpy_w_lessthan32
 
 	/* Copy 32 bytes at a time */
 .Lmemcpy_w_loop32:
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x18]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	r8, [r1], #0x04
 	ldr	r9, [r1], #0x04
 	strd	r4, [r3], #0x08
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	strd	r6, [r3], #0x08
 	strd	r8, [r3], #0x08
 	subs	r2, r2, #0x20
 	strd	r4, [r3], #0x08
 	bge	.Lmemcpy_w_loop32
 
 .Lmemcpy_w_lessthan32:
 	adds	r2, r2, #0x20		/* Adjust for extra sub */
 	ldmfdeq	sp!, {r4-r9}
 	RETeq			/* Return now if done */
 
 	and	r4, r2, #0x18
 	rsbs	r4, r4, #0x18
 	addne	pc, pc, r4, lsl #1
 	nop
 
 	/* At least 24 bytes remaining */
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	sub	r2, r2, #0x08
 	strd	r4, [r3], #0x08
 
 	/* At least 16 bytes remaining */
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	sub	r2, r2, #0x08
 	strd	r4, [r3], #0x08
 
 	/* At least 8 bytes remaining */
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	subs	r2, r2, #0x08
 	strd	r4, [r3], #0x08
 
 	/* Less than 8 bytes remaining */
 	ldmfd	sp!, {r4-r9}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04
 	ldrge	ip, [r1], #0x04
 	strge	ip, [r3], #0x04
 	RETeq			/* Return now if done */
 	addlt	r2, r2, #0x04
 	ldrb	ip, [r1], #0x01
 	cmp	r2, #0x02
 	ldrbge	r2, [r1], #0x01
 	strb	ip, [r3], #0x01
 	ldrbgt	ip, [r1]
 	strbge	r2, [r3], #0x01
 	strbgt	ip, [r3]
 	RET
 /* Place a literal pool here for the above ldr instructions to use */
 .ltorg
 
 
 /*
  * At this point, it has not been possible to word align both buffers.
  * The destination buffer is word aligned, but the source buffer is not.
  */
 .Lmemcpy_bad_align:
 	stmfd	sp!, {r4-r7}
 	bic	r1, r1, #0x03
 	cmp	ip, #2
 	ldr	ip, [r1], #0x04
 	bgt	.Lmemcpy_bad3
 	beq	.Lmemcpy_bad2
 	b	.Lmemcpy_bad1
 
 .Lmemcpy_bad1_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #8
-#else
 	mov	r4, ip, lsr #8
-#endif
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x018]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	ip, [r1], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #24
-	mov	r5, r5, lsl #8
-	orr	r5, r5, r6, lsr #24
-	mov	r6, r6, lsl #8
-	orr	r6, r6, r7, lsr #24
-	mov	r7, r7, lsl #8
-	orr	r7, r7, ip, lsr #24
-#else
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r6, lsl #24
 	mov	r6, r6, lsr #8
 	orr	r6, r6, r7, lsl #24
 	mov	r7, r7, lsr #8
 	orr	r7, r7, ip, lsl #24
-#endif
 	str	r4, [r3], #0x04
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
 .Lmemcpy_bad1:
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_bad1_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x03
 	blt	.Lmemcpy_bad_done
 
 .Lmemcpy_bad1_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #8
-#else
 	mov	r4, ip, lsr #8
-#endif
 	ldr	ip, [r1], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #24
-#else
 	orr	r4, r4, ip, lsl #24
-#endif
 	str	r4, [r3], #0x04
 	bge	.Lmemcpy_bad1_loop4
 	sub	r1, r1, #0x03
 	b	.Lmemcpy_bad_done
 
 .Lmemcpy_bad2_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #16
-#else
 	mov	r4, ip, lsr #16
-#endif
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x018]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	ip, [r1], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #16
-	mov	r5, r5, lsl #16
-	orr	r5, r5, r6, lsr #16
-	mov	r6, r6, lsl #16
-	orr	r6, r6, r7, lsr #16
-	mov	r7, r7, lsl #16
-	orr	r7, r7, ip, lsr #16
-#else
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r6, lsl #16
 	mov	r6, r6, lsr #16
 	orr	r6, r6, r7, lsl #16
 	mov	r7, r7, lsr #16
 	orr	r7, r7, ip, lsl #16
-#endif
 	str	r4, [r3], #0x04
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
 .Lmemcpy_bad2:
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_bad2_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x02
 	blt	.Lmemcpy_bad_done
 
 .Lmemcpy_bad2_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #16
-#else
 	mov	r4, ip, lsr #16
-#endif
 	ldr	ip, [r1], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #16
-#else
 	orr	r4, r4, ip, lsl #16
-#endif
 	str	r4, [r3], #0x04
 	bge	.Lmemcpy_bad2_loop4
 	sub	r1, r1, #0x02
 	b	.Lmemcpy_bad_done
 
 .Lmemcpy_bad3_loop16:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #24
-#else
 	mov	r4, ip, lsr #24
-#endif
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x018]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	ip, [r1], #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, r5, lsr #8
-	mov	r5, r5, lsl #24
-	orr	r5, r5, r6, lsr #8
-	mov	r6, r6, lsl #24
-	orr	r6, r6, r7, lsr #8
-	mov	r7, r7, lsl #24
-	orr	r7, r7, ip, lsr #8
-#else
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r6, lsl #8
 	mov	r6, r6, lsr #24
 	orr	r6, r6, r7, lsl #8
 	mov	r7, r7, lsr #24
 	orr	r7, r7, ip, lsl #8
-#endif
 	str	r4, [r3], #0x04
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
 .Lmemcpy_bad3:
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_bad3_loop16
 
 	adds	r2, r2, #0x10
 	ldmfdeq	sp!, {r4-r7}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x01
 	blt	.Lmemcpy_bad_done
 
 .Lmemcpy_bad3_loop4:
-#ifdef __ARMEB__
-	mov	r4, ip, lsl #24
-#else
 	mov	r4, ip, lsr #24
-#endif
 	ldr	ip, [r1], #0x04
 	subs	r2, r2, #0x04
-#ifdef __ARMEB__
-	orr	r4, r4, ip, lsr #8
-#else
 	orr	r4, r4, ip, lsl #8
-#endif
 	str	r4, [r3], #0x04
 	bge	.Lmemcpy_bad3_loop4
 	sub	r1, r1, #0x01
 
 .Lmemcpy_bad_done:
 	ldmfd	sp!, {r4-r7}
 	adds	r2, r2, #0x04
 	RETeq
 	ldrb	ip, [r1], #0x01
 	cmp	r2, #0x02
 	ldrbge	r2, [r1], #0x01
 	strb	ip, [r3], #0x01
 	ldrbgt	ip, [r1]
 	strbge	r2, [r3], #0x01
 	strbgt	ip, [r3]
 	RET
 
 
 /*
  * Handle short copies (less than 16 bytes), possibly misaligned.
  * Some of these are *very* common, thanks to the network stack,
  * and so are handled specially.
  */
 .Lmemcpy_short:
 	add	pc, pc, r2, lsl #2
 	nop
 	RET			/* 0x00 */
 	b	.Lmemcpy_bytewise	/* 0x01 */
 	b	.Lmemcpy_bytewise	/* 0x02 */
 	b	.Lmemcpy_bytewise	/* 0x03 */
 	b	.Lmemcpy_4		/* 0x04 */
 	b	.Lmemcpy_bytewise	/* 0x05 */
 	b	.Lmemcpy_6		/* 0x06 */
 	b	.Lmemcpy_bytewise	/* 0x07 */
 	b	.Lmemcpy_8		/* 0x08 */
 	b	.Lmemcpy_bytewise	/* 0x09 */
 	b	.Lmemcpy_bytewise	/* 0x0a */
 	b	.Lmemcpy_bytewise	/* 0x0b */
 	b	.Lmemcpy_c		/* 0x0c */
 .Lmemcpy_bytewise:
 	mov	r3, r0			/* We must not clobber r0 */
 	ldrb	ip, [r1], #0x01
 1:	subs	r2, r2, #0x01
 	strb	ip, [r3], #0x01
 	ldrbne	ip, [r1], #0x01
 	bne	1b
 	RET
 
 /******************************************************************************
  * Special case for 4 byte copies
  */
 #define	LMEMCPY_4_LOG2	6	/* 64 bytes */
 #define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
 	LMEMCPY_4_PAD
 .Lmemcpy_4:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	str	r2, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
 	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
-#ifdef __ARMEB__
-	mov	r3, r3, lsl #8		/* r3 = 012. */
-	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
-#else
 	mov	r3, r3, lsr #8		/* r3 = .210 */
 	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
-#endif
 	str	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
-#ifdef __ARMEB__
-	ldrh	r3, [r1]
-	ldrh	r2, [r1, #0x02]
-#else
 	ldrh	r3, [r1, #0x02]
 	ldrh	r2, [r1]
-#endif
 	orr	r3, r2, r3, lsl #16
 	str	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
 	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
-#ifdef __ARMEB__
-	mov	r3, r3, lsl #24		/* r3 = 0... */
-	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
-#else
 	mov	r3, r3, lsr #24		/* r3 = ...0 */
 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
-#endif
 	str	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
-#ifdef __ARMEB__
-	strb	r2, [r0, #0x03]
-	mov	r3, r2, lsr #8
-	mov	r1, r2, lsr #24
-	strb	r1, [r0]
-#else
 	strb	r2, [r0]
 	mov	r3, r2, lsr #8
 	mov	r1, r2, lsr #24
 	strb	r1, [r0, #0x03]
-#endif
 	strh	r3, [r0, #0x01]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldrh	r3, [r1, #0x02]		/* LE:r3 = ..23  LE:r3 = ..32 */
-#ifdef __ARMEB__
-	mov	r1, r2, lsr #8		/* r1 = ...0 */
-	strb	r1, [r0]
-	mov	r2, r2, lsl #8		/* r2 = .01. */
-	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
-#else
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
 	mov	r3, r3, lsr #8		/* r3 = ...3 */
-#endif
 	strh	r2, [r0, #0x01]
 	strb	r3, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
-#ifdef __ARMEB__
-	strh	r2, [r0, #0x02]
-	mov	r3, r2, lsr #16
-	strh	r3, [r0]
-#else
 	strh	r2, [r0]
 	mov	r3, r2, lsr #16
 	strh	r3, [r0, #0x02]
-#endif
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
 	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
 	strh	r1, [r0]
-#ifdef __ARMEB__
-	mov	r2, r2, lsl #8		/* r2 = 012. */
-	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
-#else
 	mov	r2, r2, lsr #24		/* r2 = ...2 */
 	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
-#endif
 	strh	r2, [r0, #0x02]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldrh	r3, [r1, #0x02]
 	strh	r2, [r0]
 	strh	r3, [r0, #0x02]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
 	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
 	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
 	strh	r1, [r0, #0x02]
-#ifdef __ARMEB__
-	mov	r3, r3, lsr #24		/* r3 = ...1 */
-	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
-#else
 	mov	r3, r3, lsl #8		/* r3 = 321. */
 	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
-#endif
 	strh	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
-#ifdef __ARMEB__
-	strb	r2, [r0, #0x03]
-	mov	r3, r2, lsr #8
-	mov	r1, r2, lsr #24
-	strh	r3, [r0, #0x01]
-	strb	r1, [r0]
-#else
 	strb	r2, [r0]
 	mov	r3, r2, lsr #8
 	mov	r1, r2, lsr #24
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
-#endif
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1110: dst is 8-bit aligned, src is 16-bit aligned
  */
-#ifdef __ARMEB__
-	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
-	strb	r3, [r0, #0x03]
-	mov	r3, r3, lsr #8		/* r3 = ...2 */
-	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
-	strh	r3, [r0, #0x01]
-	mov	r2, r2, lsr #8		/* r2 = ...0 */
-	strb	r2, [r0]
-#else
-	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
 	strh	r2, [r0, #0x01]
 	mov	r3, r3, lsr #8		/* r3 = ...3 */
 	strb	r3, [r0, #0x03]
-#endif
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 
 /******************************************************************************
  * Special case for 6 byte copies
  */
 #define	LMEMCPY_6_LOG2	6	/* 64 bytes */
 #define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
 	LMEMCPY_6_PAD
 .Lmemcpy_6:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	ldrh	r3, [r1, #0x04]
 	str	r2, [r0]
 	strh	r3, [r0, #0x04]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
-#ifdef __ARMEB__
-	mov	r2, r2, lsl #8		/* r2 = 012. */
-	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
-#else
 	mov	r2, r2, lsr #8		/* r2 = .210 */
 	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
-#endif
 	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
 	str	r2, [r0]
 	strh	r3, [r0, #0x04]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
-#ifdef __ARMEB__
-	mov	r1, r3, lsr #16		/* r1 = ..23 */
-	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
-	str	r1, [r0]
-	strh	r3, [r0, #0x04]
-#else
 	mov	r1, r3, lsr #16		/* r1 = ..54 */
 	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
 	str	r2, [r0]
 	strh	r1, [r0, #0x04]
-#endif
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
 	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
 	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r3 = xxx5 */
-#ifdef __ARMEB__
-	mov	r2, r2, lsl #24		/* r2 = 0... */
-	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
-	mov	r3, r3, lsl #8		/* r3 = 234. */
-	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
-#else
 	mov	r2, r2, lsr #24		/* r2 = ...0 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
 	mov	r1, r1, lsl #8		/* r1 = xx5. */
 	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
-#endif
 	str	r2, [r0]
 	strh	r1, [r0, #0x04]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
 	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
 	strh	r1, [r0, #0x01]
-#ifdef __ARMEB__
-	mov	r1, r3, lsr #24		/* r1 = ...0 */
-	strb	r1, [r0]
-	mov	r3, r3, lsl #8		/* r3 = 123. */
-	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
-#else
 	strb	r3, [r0]
 	mov	r3, r3, lsr #24		/* r3 = ...3 */
 	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
 	mov	r2, r2, lsr #8		/* r2 = ...5 */
-#endif
 	strh	r3, [r0, #0x03]
 	strb	r2, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrh	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strh	ip, [r0, #0x03]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
-#ifdef __ARMEB__
-	mov	r3, r2, lsr #8		/* r3 = ...0 */
-	strb	r3, [r0]
-	strb	r1, [r0, #0x05]
-	mov	r3, r1, lsr #8		/* r3 = .234 */
-	strh	r3, [r0, #0x03]
-	mov	r3, r2, lsl #8		/* r3 = .01. */
-	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
-	strh	r3, [r0, #0x01]
-#else
 	strb	r2, [r0]
 	mov	r3, r1, lsr #24
 	strb	r3, [r0, #0x05]
 	mov	r3, r1, lsr #8		/* r3 = .543 */
 	strh	r3, [r0, #0x03]
 	mov	r3, r2, lsr #8		/* r3 = ...1 */
 	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
 	strh	r3, [r0, #0x01]
-#endif
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrh	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strh	ip, [r0, #0x03]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
-#ifdef __ARMEB__
-	ldr	r2, [r1]		/* r2 = 0123 */
-	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
-	mov	r1, r2, lsr #16		/* r1 = ..01 */
-	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
-	strh	r1, [r0]
-	str	r3, [r0, #0x02]
-#else
 	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
 	ldr	r3, [r1]		/* r3 = 3210 */
 	mov	r2, r2, lsl #16		/* r2 = 54.. */
 	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
 	strh	r3, [r0]
 	str	r2, [r0, #0x02]
-#endif
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
 	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
-#ifdef __ARMEB__
-	mov	r2, r2, lsr #8		/* r2 = .345 */
-	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
-#else
 	mov	r2, r2, lsl #8		/* r2 = 543. */
 	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
-#endif
 	strh	r1, [r0]
 	str	r2, [r0, #0x02]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldr	r3, [r1, #0x02]
 	strh	r2, [r0]
 	str	r3, [r0, #0x02]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
 	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
-#ifdef __ARMEB__
-	mov	r3, r3, lsl #8		/* r3 = ..0. */
-	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
-	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
-#else
 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
 	mov	r1, r1, lsl #24		/* r1 = 5... */
 	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
-#endif
 	strh	r3, [r0]
 	str	r1, [r0, #0x02]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
-#ifdef __ARMEB__
-	mov	r3, r2, lsr #24		/* r3 = ...0 */
-	strb	r3, [r0]
-	mov	r2, r2, lsl #8		/* r2 = 123. */
-	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
-#else
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = .321 */
 	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
 	mov	r1, r1, lsr #8		/* r1 = ...5 */
-#endif
 	str	r2, [r0, #0x01]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrh	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strh	ip, [r0, #0x03]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
-#ifdef __ARMEB__
-	mov	r3, r2, lsr #8		/* r3 = ...0 */
-	strb	r3, [r0]
-	mov	r2, r2, lsl #24		/* r2 = 1... */
-	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
-#else
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
 	mov	r1, r1, lsr #24		/* r1 = ...5 */
-#endif
 	str	r2, [r0, #0x01]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldr	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	str	r3, [r0, #0x01]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 
 /******************************************************************************
  * Special case for 8 byte copies
  */
 #define	LMEMCPY_8_LOG2	6	/* 64 bytes */
 #define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
 	LMEMCPY_8_PAD
 .Lmemcpy_8:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	ldr	r3, [r1, #0x04]
 	str	r2, [r0]
 	str	r3, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
 	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
 	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
-#ifdef __ARMEB__
-	mov	r3, r3, lsl #8		/* r3 = 012. */
-	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
-	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
-#else
 	mov	r3, r3, lsr #8		/* r3 = .210 */
 	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
 	mov	r1, r1, lsl #24		/* r1 = 7... */
 	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
-#endif
 	str	r3, [r0]
 	str	r2, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
-#ifdef __ARMEB__
-	mov	r2, r2, lsl #16		/* r2 = 01.. */
-	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
-	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
-#else
 	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
 	mov	r3, r3, lsr #16		/* r3 = ..54 */
 	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
-#endif
 	str	r2, [r0]
 	str	r3, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
 	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
-#ifdef __ARMEB__
-	mov	r3, r3, lsl #24		/* r3 = 0... */
-	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
-	mov	r2, r2, lsl #24		/* r2 = 4... */
-	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
-#else
 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
 	mov	r2, r2, lsr #24		/* r2 = ...4 */
 	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
-#endif
 	str	r3, [r0]
 	str	r2, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
 	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
-#ifdef __ARMEB__
-	mov	r1, r3, lsr #24		/* r1 = ...0 */
-	strb	r1, [r0]
-	mov	r1, r3, lsr #8		/* r1 = .012 */
-	strb	r2, [r0, #0x07]
-	mov	r3, r3, lsl #24		/* r3 = 3... */
-	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
-#else
 	strb	r3, [r0]
 	mov	r1, r2, lsr #24		/* r1 = ...7 */
 	strb	r1, [r0, #0x07]
 	mov	r1, r3, lsr #8		/* r1 = .321 */
 	mov	r3, r3, lsr #24		/* r3 = ...3 */
 	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
-#endif
 	strh	r1, [r0, #0x01]
 	str	r3, [r0, #0x03]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldr	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x07]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	str	ip, [r0, #0x03]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
-#ifdef __ARMEB__
-	mov	ip, r2, lsr #8		/* ip = ...0 */
-	strb	ip, [r0]
-	mov	ip, r2, lsl #8		/* ip = .01. */
-	orr	ip, ip, r3, lsr #24	/* ip = .012 */
-	strb	r1, [r0, #0x07]
-	mov	r3, r3, lsl #8		/* r3 = 345. */
-	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
-#else
 	strb	r2, [r0]		/* 0 */
 	mov	ip, r1, lsr #8		/* ip = ...7 */
 	strb	ip, [r0, #0x07]		/* 7 */
 	mov	ip, r2, lsr #8		/* ip = ...1 */
 	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
 	mov	r3, r3, lsr #8		/* r3 = .543 */
 	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
-#endif
 	strh	ip, [r0, #0x01]
 	str	r3, [r0, #0x03]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
 	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
 	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
 	strb	r3, [r0]
 	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
-#ifdef __ARMEB__
-	strh	r3, [r0, #0x01]
-	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
-#else
 	strh	ip, [r0, #0x01]
 	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
-#endif
 	str	r2, [r0, #0x03]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
-#ifdef __ARMEB__
-	strh	r1, [r0]
-	mov	r1, r3, lsr #16		/* r1 = ..45 */
-	orr	r2, r1 ,r2, lsl #16	/* r2 = 2345 */
-#else
 	strh	r2, [r0]
 	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
 	mov	r3, r3, lsr #16		/* r3 = ..76 */
-#endif
 	str	r2, [r0, #0x02]
 	strh	r3, [r0, #0x06]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
 	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
 	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
 	strh	r1, [r0]
-#ifdef __ARMEB__
-	mov	r1, r2, lsl #24		/* r1 = 2... */
-	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
-	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
-#else
 	mov	r1, r2, lsr #24		/* r1 = ...2 */
 	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
 	mov	r3, r3, lsr #24		/* r3 = ...6 */
 	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
-#endif
 	str	r1, [r0, #0x02]
 	strh	r3, [r0, #0x06]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldr	ip, [r1, #0x02]
 	ldrh	r3, [r1, #0x06]
 	strh	r2, [r0]
 	str	ip, [r0, #0x02]
 	strh	r3, [r0, #0x06]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
 	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
 	ldrb	ip, [r1]		/* ip = ...0 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
 	strh	r1, [r0, #0x06]
-#ifdef __ARMEB__
-	mov	r3, r3, lsr #24		/* r3 = ...5 */
-	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
-	mov	r2, r2, lsr #24		/* r2 = ...1 */
-	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
-#else
 	mov	r3, r3, lsl #24		/* r3 = 5... */
 	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
 	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
-#endif
 	str	r3, [r0, #0x02]
 	strh	r2, [r0]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
 	strh	r1, [r0, #0x05]
-#ifdef __ARMEB__
-	strb	r3, [r0, #0x07]
-	mov	r1, r2, lsr #24		/* r1 = ...0 */
-	strb	r1, [r0]
-	mov	r2, r2, lsl #8		/* r2 = 123. */
-	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
-	str	r2, [r0, #0x01]
-#else
 	strb	r2, [r0]
 	mov	r1, r3, lsr #24		/* r1 = ...7 */
 	strb	r1, [r0, #0x07]
 	mov	r2, r2, lsr #8		/* r2 = .321 */
 	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
 	str	r2, [r0, #0x01]
-#endif
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
 	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
 	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
 	strb	r3, [r0]
 	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
-#ifdef __ARMEB__
-	strh	ip, [r0, #0x05]
-	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
-#else
 	strh	r3, [r0, #0x05]
 	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
-#endif
 	str	r2, [r0, #0x01]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
-#ifdef __ARMEB__
-	mov	ip, r2, lsr #8		/* ip = ...0 */
-	strb	ip, [r0]
-	mov	ip, r2, lsl #24		/* ip = 1... */
-	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
-	strb	r1, [r0, #0x07]
-	mov	r1, r1, lsr #8		/* r1 = ...6 */
-	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
-#else
 	strb	r2, [r0]
 	mov	ip, r2, lsr #8		/* ip = ...1 */
 	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
 	mov	r2, r1, lsr #8		/* r2 = ...7 */
 	strb	r2, [r0, #0x07]
 	mov	r1, r1, lsl #8		/* r1 = .76. */
 	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
-#endif
 	str	ip, [r0, #0x01]
 	strh	r1, [r0, #0x05]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldr	ip, [r1, #0x01]
 	ldrh	r3, [r1, #0x05]
 	ldrb	r1, [r1, #0x07]
 	strb	r2, [r0]
 	str	ip, [r0, #0x01]
 	strh	r3, [r0, #0x05]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /******************************************************************************
  * Special case for 12 byte copies
  */
 #define	LMEMCPY_C_LOG2	7	/* 128 bytes */
 #define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
 	LMEMCPY_C_PAD
 .Lmemcpy_c:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	ldr	r3, [r1, #0x04]
 	ldr	r1, [r1, #0x08]
 	str	r2, [r0]
 	str	r3, [r0, #0x04]
 	str	r1, [r0, #0x08]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
 	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
 	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
-#ifdef __ARMEB__
-	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
-	str	r2, [r0, #0x08]
-	mov	r2, ip, lsr #24		/* r2 = ...7 */
-	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
-	mov	r1, r1, lsl #8		/* r1 = 012. */
-	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
-#else
 	mov	r2, r2, lsl #24		/* r2 = B... */
 	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
 	str	r2, [r0, #0x08]
 	mov	r2, ip, lsl #24		/* r2 = 7... */
 	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
 	mov	r1, r1, lsr #8		/* r1 = .210 */
 	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
-#endif
 	str	r2, [r0, #0x04]
 	str	r1, [r0]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
 	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
-#ifdef __ARMEB__
-	mov	r2, r2, lsl #16		/* r2 = 01.. */
-	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
-	str	r2, [r0]
-	mov	r3, r3, lsl #16		/* r3 = 45.. */
-	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
-	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
-#else
 	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
 	str	r2, [r0]
 	mov	r3, r3, lsr #16		/* r3 = ..54 */
 	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
 	mov	r1, r1, lsl #16		/* r1 = BA.. */
 	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
-#endif
 	str	r3, [r0, #0x04]
 	str	r1, [r0, #0x08]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]		/* r2 = ...0 */
 	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
 	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
 	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
-#ifdef __ARMEB__
-	mov	r2, r2, lsl #24		/* r2 = 0... */
-	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
-	str	r2, [r0]
-	mov	r3, r3, lsl #24		/* r3 = 4... */
-	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
-	mov	r1, r1, lsr #8		/* r1 = .9AB */
-	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
-#else
 	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
 	str	r2, [r0]
 	mov	r3, r3, lsr #24		/* r3 = ...4 */
 	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
 	mov	r1, r1, lsl #8		/* r1 = BA9. */
 	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
-#endif
 	str	r3, [r0, #0x04]
 	str	r1, [r0, #0x08]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
 	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
 	strh	r1, [r0, #0x01]
-#ifdef __ARMEB__
-	mov	r1, r2, lsr #24		/* r1 = ...0 */
-	strb	r1, [r0]
-	mov	r1, r2, lsl #24		/* r1 = 3... */
-	orr	r2, r1, r3, lsr #8	/* r1 = 3456 */
-	mov	r1, r3, lsl #24		/* r1 = 7... */
-	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
-#else
 	strb	r2, [r0]
 	mov	r1, r2, lsr #24		/* r1 = ...3 */
 	orr	r2, r1, r3, lsl #8	/* r1 = 6543 */
 	mov	r1, r3, lsr #24		/* r1 = ...7 */
 	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
 	mov	ip, ip, lsr #24		/* ip = ...B */
-#endif
 	str	r2, [r0, #0x03]
 	str	r1, [r0, #0x07]
 	strb	ip, [r0, #0x0b]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldr	ip, [r1, #0x03]
 	strb	r2, [r0]
 	ldr	r2, [r1, #0x07]
 	ldrb	r1, [r1, #0x0b]
 	strh	r3, [r0, #0x01]
 	str	ip, [r0, #0x03]
 	str	r2, [r0, #0x07]
 	strb	r1, [r0, #0x0b]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
 	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
-#ifdef __ARMEB__
-	mov	r2, r2, ror #8		/* r2 = 1..0 */
 	strb	r2, [r0]
-	mov	r2, r2, lsr #16		/* r2 = ..1. */
-	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
-	strh	r2, [r0, #0x01]
-	mov	r2, r3, lsl #8		/* r2 = 345. */
-	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
-	mov	r2, ip, lsl #8		/* r2 = 789. */
-	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
-#else
-	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
 	strh	r2, [r0, #0x01]
 	mov	r2, r3, lsr #8		/* r2 = .543 */
 	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
 	mov	r2, ip, lsr #8		/* r2 = .987 */
 	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
 	mov	r1, r1, lsr #8		/* r1 = ...B */
-#endif
 	str	r3, [r0, #0x03]
 	str	r2, [r0, #0x07]
 	strb	r1, [r0, #0x0b]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
  */
 	ldrb	r2, [r1]
 	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
 	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
 	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
 	strb	r2, [r0]
-#ifdef __ARMEB__
-	mov	r2, r3, lsr #16		/* r2 = ..12 */
-	strh	r2, [r0, #0x01]
-	mov	r3, r3, lsl #16		/* r3 = 34.. */
-	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
-	mov	ip, ip, lsl #16		/* ip = 78.. */
-	orr	ip, ip, r1, lsr #16	/* ip = 789A */
-	mov	r1, r1, lsr #8		/* r1 = .9AB */
-#else
 	strh	r3, [r0, #0x01]
 	mov	r3, r3, lsr #16		/* r3 = ..43 */
 	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
 	mov	ip, ip, lsr #16		/* ip = ..87 */
 	orr	ip, ip, r1, lsl #16	/* ip = A987 */
 	mov	r1, r1, lsr #16		/* r1 = ..xB */
-#endif
 	str	r3, [r0, #0x03]
 	str	ip, [r0, #0x07]
 	strb	r1, [r0, #0x0b]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
 	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
 	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
-#ifdef __ARMEB__
-	strh	r1, [r0]
-	mov	r1, ip, lsl #16		/* r1 = 23.. */
-	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
-	mov	r3, r3, lsl #16		/* r3 = 67.. */
-	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
-#else
 	strh	ip, [r0]
 	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
 	mov	r3, r3, lsr #16		/* r3 = ..76 */
 	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
 	mov	r2, r2, lsr #16		/* r2 = ..BA */
-#endif
 	str	r1, [r0, #0x02]
 	str	r3, [r0, #0x06]
 	strh	r2, [r0, #0x0a]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
 	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
 	strh	ip, [r0]
 	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
 	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
-#ifdef __ARMEB__
-	mov	r2, r2, lsl #24		/* r2 = 2... */
-	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
-	mov	r3, r3, lsl #24		/* r3 = 6... */
-	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
-	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
-#else
 	mov	r2, r2, lsr #24		/* r2 = ...2 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
 	mov	r3, r3, lsr #24		/* r3 = ...6 */
 	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
 	mov	r1, r1, lsl #8		/* r1 = ..B. */
 	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
-#endif
 	str	r2, [r0, #0x02]
 	str	r3, [r0, #0x06]
 	strh	r1, [r0, #0x0a]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldr	r3, [r1, #0x02]
 	ldr	ip, [r1, #0x06]
 	ldrh	r1, [r1, #0x0a]
 	strh	r2, [r0]
 	str	r3, [r0, #0x02]
 	str	ip, [r0, #0x06]
 	strh	r1, [r0, #0x0a]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
  */
 	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
 	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
 	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
 	strh	ip, [r0, #0x0a]
 	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
 	ldrb	r1, [r1]		/* r1 = ...0 */
-#ifdef __ARMEB__
-	mov	r2, r2, lsr #24		/* r2 = ...9 */
-	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
-	mov	r3, r3, lsr #24		/* r3 = ...5 */
-	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
-	mov	r1, r1, lsl #8		/* r1 = ..0. */
-	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
-#else
 	mov	r2, r2, lsl #24		/* r2 = 9... */
 	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
 	mov	r3, r3, lsl #24		/* r3 = 5... */
 	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
 	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
-#endif
 	str	r2, [r0, #0x06]
 	str	r3, [r0, #0x02]
 	strh	r1, [r0]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
 	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
-#ifdef __ARMEB__
-	mov	r3, r2, lsr #24		/* r3 = ...0 */
-	strb	r3, [r0]
-	mov	r2, r2, lsl #8		/* r2 = 123. */
-	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
-	str	r2, [r0, #0x01]
-	mov	r2, ip, lsl #8		/* r2 = 567. */
-	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
-	str	r2, [r0, #0x05]
-	mov	r2, r1, lsr #8		/* r2 = ..9A */
-	strh	r2, [r0, #0x09]
-	strb	r1, [r0, #0x0b]
-#else
 	strb	r2, [r0]
 	mov	r3, r2, lsr #8		/* r3 = .321 */
 	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
 	str	r3, [r0, #0x01]
 	mov	r3, ip, lsr #8		/* r3 = .765 */
 	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
 	str	r3, [r0, #0x05]
 	mov	r1, r1, lsr #8		/* r1 = .BA9 */
 	strh	r1, [r0, #0x09]
 	mov	r1, r1, lsr #16		/* r1 = ...B */
 	strb	r1, [r0, #0x0b]
-#endif
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
  */
 	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
 	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
 	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
 	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
 	strb	r2, [r0, #0x0b]
-#ifdef __ARMEB__
-	strh	r3, [r0, #0x09]
-	mov	r3, r3, lsr #16		/* r3 = ..78 */
-	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
-	mov	ip, ip, lsr #16		/* ip = ..34 */
-	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
-	mov	r1, r1, lsr #16		/* r1 = ..x0 */
-#else
 	mov	r2, r3, lsr #16		/* r2 = ..A9 */
 	strh	r2, [r0, #0x09]
 	mov	r3, r3, lsl #16		/* r3 = 87.. */
 	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
 	mov	ip, ip, lsl #16		/* ip = 43.. */
 	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
 	mov	r1, r1, lsr #8		/* r1 = .210 */
-#endif
 	str	r3, [r0, #0x05]
 	str	ip, [r0, #0x01]
 	strb	r1, [r0]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
  */
-#ifdef __ARMEB__
-	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
-	ldr	ip, [r1, #0x06]		/* ip = 6789 */
-	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
-	ldrh	r1, [r1]		/* r1 = ..01 */
-	strb	r2, [r0, #0x0b]
-	mov	r2, r2, lsr #8		/* r2 = ...A */
-	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
-	mov	ip, ip, lsr #8		/* ip = .678 */
-	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
-	mov	r3, r3, lsr #8		/* r3 = .234 */
-	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
-	mov	r1, r1, lsr #8		/* r1 = ...0 */
-	strb	r1, [r0]
-	str	r3, [r0, #0x01]
-	str	ip, [r0, #0x05]
-	strh	r2, [r0, #0x09]
-#else
 	ldrh	r2, [r1]		/* r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
 	ldr	ip, [r1, #0x06]		/* ip = 9876 */
 	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
 	mov	r3, r3, lsr #24		/* r3 = ...5 */
 	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
 	mov	ip, ip, lsr #24		/* ip = ...9 */
 	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
 	mov	r1, r1, lsr #8		/* r1 = ...B */
 	str	r2, [r0, #0x01]
 	str	r3, [r0, #0x05]
 	strh	ip, [r0, #0x09]
 	strb	r1, [r0, #0x0b]
-#endif
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
  */
 	ldrb	r2, [r1]
 	ldr	r3, [r1, #0x01]
 	ldr	ip, [r1, #0x05]
 	strb	r2, [r0]
 	ldrh	r2, [r1, #0x09]
 	ldrb	r1, [r1, #0x0b]
 	str	r3, [r0, #0x01]
 	str	ip, [r0, #0x05]
 	strh	r2, [r0, #0x09]
 	strb	r1, [r0, #0x0b]
 	RET
 END(memcpy)
 #endif /* _ARM_ARCH_5E */
 
 #ifdef GPROF
 
 ENTRY(user)
 	nop
 END(user)
 ENTRY(btrap)
 	nop
 END(btrap)
 ENTRY(etrap)
 	nop
 END(etrap)
 ENTRY(bintr)
 	nop
 END(bintr)
 ENTRY(eintr)
 	nop
 END(eintr)
 #endif
Index: head/sys/arm/arm/vm_machdep.c
===================================================================
--- head/sys/arm/arm/vm_machdep.c	(revision 368152)
+++ head/sys/arm/arm/vm_machdep.c	(revision 368153)
@@ -1,346 +1,320 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1982, 1986 The Regents of the University of California.
  * Copyright (c) 1989, 1990 William Jolitz
  * Copyright (c) 1994 John Dyson
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
  *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/unistd.h>
 
 #include <machine/cpu.h>
 #include <machine/frame.h>
 #include <machine/pcb.h>
 #include <machine/sysarch.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 #include <vm/vm_pageout.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <machine/md_var.h>
 #include <machine/vfp.h>
 
 /*
  * struct switchframe and trapframe must both be a multiple of 8
  * for correct stack alignment.
  */
 _Static_assert((sizeof(struct switchframe) % 8) == 0, "Bad alignment");
 _Static_assert((sizeof(struct trapframe) % 8) == 0, "Bad alignment");
 
 uint32_t initial_fpscr = VFPSCR_DN | VFPSCR_FZ;
 
 /*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child
  * ready to run and return to user mode.
  */
 void
 cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
 {
 	struct pcb *pcb2;
 	struct trapframe *tf;
 	struct mdproc *mdp2;
 
 	if ((flags & RFPROC) == 0)
 		return;
 
 	/* Point the pcb to the top of the stack */
 	pcb2 = (struct pcb *)
 	    (td2->td_kstack + td2->td_kstack_pages * PAGE_SIZE) - 1;
 #ifdef VFP
 	/* Store actual state of VFP */
 	if (curthread == td1) {
 		critical_enter();
 		vfp_store(&td1->td_pcb->pcb_vfpstate, false);
 		critical_exit();
 	}
 #endif
 	td2->td_pcb = pcb2;
 
 	/* Clone td1's pcb */
 	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
 
 	/* Point to mdproc and then copy over td1's contents */
 	mdp2 = &p2->p_md;
 	bcopy(&td1->td_proc->p_md, mdp2, sizeof(*mdp2));
 
 	/* Point the frame to the stack in front of pcb and copy td1's frame */
 	td2->td_frame = (struct trapframe *)pcb2 - 1;
 	*td2->td_frame = *td1->td_frame;
 
 	/*
 	 * Create a new fresh stack for the new process.
 	 * Copy the trap frame for the return to user mode as if from a
 	 * syscall.  This copies most of the user mode register values.
 	 */
 	pmap_set_pcb_pagedir(vmspace_pmap(p2->p_vmspace), pcb2);
 	pcb2->pcb_regs.sf_r4 = (register_t)fork_return;
 	pcb2->pcb_regs.sf_r5 = (register_t)td2;
 	pcb2->pcb_regs.sf_lr = (register_t)fork_trampoline;
 	pcb2->pcb_regs.sf_sp = STACKALIGN(td2->td_frame);
 	pcb2->pcb_regs.sf_tpidrurw = (register_t)get_tls();
 
 	pcb2->pcb_vfpcpu = -1;
 	pcb2->pcb_vfpstate.fpscr = initial_fpscr;
 
 	tf = td2->td_frame;
 	tf->tf_spsr &= ~PSR_C;
 	tf->tf_r0 = 0;
 	tf->tf_r1 = 0;
 
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_cspr = PSR_SVC32_MODE;
 }
 
 void
 cpu_thread_swapin(struct thread *td)
 {
 }
 
 void
 cpu_thread_swapout(struct thread *td)
 {
 }
 
 void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 	struct trapframe *frame;
-	int fixup;
-#ifdef __ARMEB__
-	u_int call;
-#endif
 
 	frame = td->td_frame;
-	fixup = 0;
-
-#ifdef __ARMEB__
-	/*
-	 * __syscall returns an off_t while most other syscalls return an
-	 * int. As an off_t is 64-bits and an int is 32-bits we need to
-	 * place the returned data into r1. As the lseek and freebsd6_lseek
-	 * syscalls also return an off_t they do not need this fixup.
-	 */
-	call = frame->tf_r7;
-	if (call == SYS___syscall) {
-		register_t *ap = &frame->tf_r0;
-		register_t code = ap[_QUAD_LOWWORD];
-		fixup = (code != SYS_lseek);
-	}
-#endif
-
 	switch (error) {
 	case 0:
-		if (fixup) {
-			frame->tf_r0 = 0;
-			frame->tf_r1 = td->td_retval[0];
-		} else {
-			frame->tf_r0 = td->td_retval[0];
-			frame->tf_r1 = td->td_retval[1];
-		}
+		frame->tf_r0 = td->td_retval[0];
+		frame->tf_r1 = td->td_retval[1];
 		frame->tf_spsr &= ~PSR_C;   /* carry bit */
 		break;
 	case ERESTART:
 		/*
 		 * Reconstruct the pc to point at the swi.
 		 */
 #if __ARM_ARCH >= 7
 		if ((frame->tf_spsr & PSR_T) != 0)
 			frame->tf_pc -= THUMB_INSN_SIZE;
 		else
 #endif
 			frame->tf_pc -= INSN_SIZE;
 		break;
 	case EJUSTRETURN:
 		/* nothing to do */
 		break;
 	default:
 		frame->tf_r0 = error;
 		frame->tf_spsr |= PSR_C;    /* carry bit */
 		break;
 	}
 }
 
 /*
  * Initialize machine state, mostly pcb and trap frame for a new
  * thread, about to return to userspace.  Put enough state in the new
  * thread's PCB to get it to go back to the fork_return(), which
  * finalizes the thread state and handles peculiarities of the first
  * return to userspace for the new thread.
  */
 void
 cpu_copy_thread(struct thread *td, struct thread *td0)
 {
 
 	bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));
 	bcopy(td0->td_pcb, td->td_pcb, sizeof(struct pcb));
 
 	td->td_pcb->pcb_regs.sf_r4 = (register_t)fork_return;
 	td->td_pcb->pcb_regs.sf_r5 = (register_t)td;
 	td->td_pcb->pcb_regs.sf_lr = (register_t)fork_trampoline;
 	td->td_pcb->pcb_regs.sf_sp = STACKALIGN(td->td_frame);
 
 	td->td_frame->tf_spsr &= ~PSR_C;
 	td->td_frame->tf_r0 = 0;
 
 	/* Setup to release spin count in fork_exit(). */
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_cspr = PSR_SVC32_MODE;
 }
 
 /*
  * Set that machine state for performing an upcall that starts
  * the entry function with the given argument.
  */
 void
 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg,
 	stack_t *stack)
 {
 	struct trapframe *tf = td->td_frame;
 
 	tf->tf_usr_sp = STACKALIGN((int)stack->ss_sp + stack->ss_size);
 	tf->tf_pc = (int)entry;
 	tf->tf_r0 = (int)arg;
 	tf->tf_spsr = PSR_USR32_MODE;
 }
 
 int
 cpu_set_user_tls(struct thread *td, void *tls_base)
 {
 
 	td->td_pcb->pcb_regs.sf_tpidrurw = (register_t)tls_base;
 	if (td == curthread)
 		set_tls(tls_base);
 	return (0);
 }
 
 void
 cpu_thread_exit(struct thread *td)
 {
 }
 
 void
 cpu_thread_alloc(struct thread *td)
 {
 	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_pages *
 	    PAGE_SIZE) - 1;
 	/*
 	 * Ensure td_frame is aligned to an 8 byte boundary as it will be
 	 * placed into the stack pointer which must be 8 byte aligned in
 	 * the ARM EABI.
 	 */
 	td->td_frame = (struct trapframe *)((caddr_t)td->td_pcb) - 1;
 }
 
 void
 cpu_thread_free(struct thread *td)
 {
 }
 
 void
 cpu_thread_clean(struct thread *td)
 {
 }
 
 /*
  * Intercept the return address from a freshly forked process that has NOT
  * been scheduled yet.
  *
  * This is needed to make kernel threads stay in kernel mode.
  */
 void
 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg)
 {
 	td->td_pcb->pcb_regs.sf_r4 = (register_t)func;	/* function */
 	td->td_pcb->pcb_regs.sf_r5 = (register_t)arg;	/* first arg */
 }
 
 /*
  * Software interrupt handler for queued VM system processing.
  */
 void
 swi_vm(void *dummy)
 {
 
 	if (busdma_swi_pending)
 		busdma_swi();
 }
 
 void
 cpu_exit(struct thread *td)
 {
 }
 
 bool
 cpu_exec_vmspace_reuse(struct proc *p __unused, vm_map_t map __unused)
 {
 
 	return (true);
 }
 
 int
 cpu_procctl(struct thread *td __unused, int idtype __unused, id_t id __unused,
     int com __unused, void *data __unused)
 {
 
 	return (EINVAL);
 }