Index: head/sys/arm/arm/bcopy_page.S
===================================================================
--- head/sys/arm/arm/bcopy_page.S	(revision 172613)
+++ head/sys/arm/arm/bcopy_page.S	(revision 172614)
@@ -1,276 +1,276 @@
/* $NetBSD: bcopy_page.S,v 1.7 2003/10/13 21:03:13 scw Exp $ */

/*-
 * Copyright (c) 1995 Scott Stevens
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Scott Stevens.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * RiscBSD kernel project
 *
 * bcopy_page.S
 *
 * page optimised bcopy and bzero routines
 *
 * Created : 08/04/95
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#include "assym.s"

-#ifndef __XSCALE__
+#ifndef _ARM_ARCH_5E

/* #define BIG_LOOPS */

/*
 * bcopy_page(src, dest)
 *
 * Optimised copy page routine.
 *
 * On entry:
 *   r0 - src address
 *   r1 - dest address
 *
 * Requires:
 *   number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS),
 *   128 otherwise.
 */

#define	CHUNK_SIZE	32

#define	PREFETCH_FIRST_CHUNK	/* nothing */
#define	PREFETCH_NEXT_CHUNK	/* nothing */

#ifndef COPY_CHUNK
#define	COPY_CHUNK \
	PREFETCH_NEXT_CHUNK ; \
	ldmia	r0!, {r3-r8,ip,lr} ; \
	stmia	r1!, {r3-r8,ip,lr}
#endif /* ! COPY_CHUNK */

#ifndef SAVE_REGS
#define	SAVE_REGS	stmfd	sp!, {r4-r8, lr}
#define	RESTORE_REGS	ldmfd	sp!, {r4-r8, pc}
#endif

ENTRY(bcopy_page)
	PREFETCH_FIRST_CHUNK
	SAVE_REGS
#ifdef BIG_LOOPS
	mov	r2, #(PAGE_SIZE >> 9)
#else
	mov	r2, #(PAGE_SIZE >> 7)
#endif

1:
	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK

#ifdef BIG_LOOPS
	/* There is little point making the loop any larger; unless we are
	   running with the cache off, the load/store overheads will
	   completely dominate this loop. */
	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK

	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK

	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK
	COPY_CHUNK
#endif
	subs	r2, r2, #1
	bne	1b

	RESTORE_REGS		/* ...and return. */

/*
 * bzero_page(dest)
 *
 * Optimised zero page routine.
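 *
 * Roughly equivalent C (an illustrative sketch only, not part of the
 * build; it mirrors the four stmia stores of eight zeroed registers
 * per iteration in the non-BIG_LOOPS loop below):
 *
 *	uint32_t *p = (uint32_t *)dest;
 *	int i, n;
 *	for (n = PAGE_SIZE / 128; n > 0; n--)	/* PAGE_SIZE >> 7 */
 *		for (i = 0; i < 32; i++)	/* 4 x 8 words = 128 bytes */
 *			*p++ = 0;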
* * On entry: * r0 - dest address * * Requires: * number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS), 128 * otherwise */ ENTRY(bzero_page) stmfd sp!, {r4-r8, lr} #ifdef BIG_LOOPS mov r2, #(PAGE_SIZE >> 9) #else mov r2, #(PAGE_SIZE >> 7) #endif mov r3, #0 mov r4, #0 mov r5, #0 mov r6, #0 mov r7, #0 mov r8, #0 mov ip, #0 mov lr, #0 1: stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} #ifdef BIG_LOOPS /* There is little point making the loop any larger; unless we are running with the cache off, the load/store overheads will completely dominate this loop. */ stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} #endif subs r2, r2, #1 bne 1b ldmfd sp!, {r4-r8, pc} -#else /* __XSCALE__ */ +#else /* _ARM_ARCH_5E */ /* - * XSCALE version of bcopy_page + * armv5e version of bcopy_page */ ENTRY(bcopy_page) pld [r0] stmfd sp!, {r4, r5} mov ip, #32 ldr r2, [r0], #0x04 /* 0x00 */ ldr r3, [r0], #0x04 /* 0x04 */ 1: pld [r0, #0x18] /* Prefetch 0x20 */ ldr r4, [r0], #0x04 /* 0x08 */ ldr r5, [r0], #0x04 /* 0x0c */ strd r2, [r1], #0x08 ldr r2, [r0], #0x04 /* 0x10 */ ldr r3, [r0], #0x04 /* 0x14 */ strd r4, [r1], #0x08 ldr r4, [r0], #0x04 /* 0x18 */ ldr r5, [r0], #0x04 /* 0x1c */ strd r2, [r1], #0x08 ldr r2, [r0], #0x04 /* 0x20 */ ldr r3, [r0], #0x04 /* 0x24 */ pld [r0, #0x18] /* Prefetch 0x40 */ strd r4, [r1], #0x08 ldr r4, [r0], #0x04 /* 0x28 */ ldr r5, [r0], #0x04 /* 0x2c */ strd r2, [r1], #0x08 ldr r2, [r0], #0x04 /* 0x30 */ ldr r3, [r0], #0x04 /* 0x34 */ strd r4, [r1], #0x08 ldr r4, [r0], #0x04 /* 0x38 */ ldr r5, [r0], #0x04 /* 0x3c */ strd r2, [r1], #0x08 ldr r2, [r0], #0x04 /* 0x40 */ ldr r3, [r0], #0x04 /* 0x44 */ pld [r0, #0x18] /* Prefetch 0x60 */ strd r4, [r1], #0x08 ldr r4, [r0], #0x04 /* 0x48 */ ldr r5, [r0], #0x04 /* 0x4c */ strd r2, [r1], #0x08 ldr r2, [r0], #0x04 /* 0x50 */ ldr r3, [r0], #0x04 /* 0x54 */ strd r4, [r1], #0x08 ldr r4, [r0], #0x04 /* 0x58 */ ldr r5, [r0], #0x04 /* 0x5c */ strd r2, [r1], #0x08 ldr r2, [r0], #0x04 /* 0x60 */ ldr r3, [r0], #0x04 /* 0x64 */ pld [r0, #0x18] /* Prefetch 0x80 */ strd r4, [r1], #0x08 ldr r4, [r0], #0x04 /* 0x68 */ ldr r5, [r0], #0x04 /* 0x6c */ strd r2, [r1], #0x08 ldr r2, [r0], #0x04 /* 0x70 */ ldr r3, [r0], #0x04 /* 0x74 */ strd r4, [r1], #0x08 ldr r4, [r0], #0x04 /* 0x78 */ ldr r5, [r0], #0x04 /* 0x7c */ strd r2, [r1], #0x08 subs ip, ip, #0x01 ldrgt r2, [r0], #0x04 /* 0x80 */ ldrgt r3, [r0], #0x04 /* 0x84 */ strd r4, [r1], #0x08 bgt 1b ldmfd sp!, {r4, r5} RET /* - * XSCALE version of bzero_page + * armv5e version of bzero_page */ ENTRY(bzero_page) mov r1, #PAGE_SIZE mov r2, #0 mov r3, #0 1: strd r2, [r0], #8 /* 32 */ strd r2, [r0], #8 strd r2, [r0], #8 strd r2, [r0], #8 strd r2, [r0], #8 /* 64 */ strd r2, [r0], #8 strd r2, [r0], #8 strd r2, [r0], #8 strd r2, [r0], #8 /* 96 */ strd r2, [r0], #8 strd r2, [r0], #8 strd r2, [r0], #8 strd r2, [r0], #8 /* 128 */ strd r2, [r0], #8 strd r2, [r0], #8 strd r2, [r0], #8 subs r1, r1, #128 bne 1b RET -#endif /* __XSCALE__ */ +#endif /* _ARM_ARCH_5E */ Index: head/sys/arm/arm/bcopyinout.S =================================================================== --- head/sys/arm/arm/bcopyinout.S (revision 172613) +++ head/sys/arm/arm/bcopyinout.S (revision 172614) @@ -1,657 +1,657 @@ /* $NetBSD: bcopyinout.S,v 
1.11 2003/10/13 21:22:40 scw Exp $	*/

/*-
 * Copyright (c) 2002 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Allen Briggs for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed for the NetBSD Project by
 *	Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "assym.s"

#include <machine/asm.h>

.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)

__FBSDID("$FreeBSD$");

-#ifdef __XSCALE__
+#ifdef _ARM_ARCH_5E
#include <arm/arm/bcopyinout_xscale.S>
#else

	.text
	.align	0

#ifdef MULTIPROCESSOR
.Lcpu_info:
	.word	_C_LABEL(cpu_info)
#else
.Lcurpcb:
	.word	_C_LABEL(__pcpu) + PC_CURPCB
#endif

#define	SAVE_REGS	stmfd	sp!, {r4-r11}
#define	RESTORE_REGS	ldmfd	sp!, {r4-r11}

-#if defined(__XSCALE__)
+#if defined(_ARM_ARCH_5E)
#define	HELLOCPP #
#define	PREFETCH(rx,o)	pld	[ rx , HELLOCPP (o) ]
#else
#define	PREFETCH(rx,o)
#endif

/*
 * r0 = user space address
 * r1 = kernel space address
 * r2 = length
 *
 * Copies bytes from user space to kernel space
 *
 * We save/restore r4-r11:
 * r4-r11 are scratch
 */
ENTRY(copyin)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	RETeq

	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, r0
	mov	r0, r1
	mov	r1, r3
	mov	r3, #2	/* SRC_IS_USER */
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	moveq	r0, #0
	RETeq
.Lnormal:
	SAVE_REGS
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	stmfd	sp!, {r0-r2, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r4, .Lcpu_info
	ldr	r4, [r4, r0, lsl #2]
	ldr	r4, [r4, #CI_CURPCB]
	ldmfd	sp!, {r0-r2, r14}
#else
	ldr	r4, .Lcurpcb
	ldr	r4, [r4]
#endif

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If not too many bytes, take the slow path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * Align destination to word boundary.
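	 * The "ldr pc, [pc, r6, lsl #2]" below is a computed goto: in ARM
	 * state, reading pc yields the address of the current instruction
	 * plus 8, which is exactly where the word table after the branch
	 * starts.  In effect (illustrative pseudocode): goto table[dst & 3];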
*/ and r6, r1, #0x3 ldr pc, [pc, r6, lsl #2] b .Lialend .word .Lialend .word .Lial3 .word .Lial2 .word .Lial1 .Lial3: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lial2: ldrbt r7, [r0], #1 sub r2, r2, #1 strb r7, [r1], #1 .Lial1: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lialend: /* * If few bytes left, finish slow. */ cmp r2, #0x08 blt .Licleanup /* * If source is not aligned, finish slow. */ ands r3, r0, #0x03 bne .Licleanup cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */ blt .Licleanup8 /* * Align destination to cacheline boundary. * If source and destination are nicely aligned, this can be a big * win. If not, it's still cheaper to copy in groups of 32 even if * we don't get the nice cacheline alignment. */ and r6, r1, #0x1f ldr pc, [pc, r6] b .Licaligned .word .Licaligned .word .Lical28 .word .Lical24 .word .Lical20 .word .Lical16 .word .Lical12 .word .Lical8 .word .Lical4 .Lical28:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 .Lical24:ldrt r7, [r0], #4 sub r2, r2, #4 str r7, [r1], #4 .Lical20:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 .Lical16:ldrt r7, [r0], #4 sub r2, r2, #4 str r7, [r1], #4 .Lical12:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 .Lical8:ldrt r7, [r0], #4 sub r2, r2, #4 str r7, [r1], #4 .Lical4:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 /* * We start with > 0x40 bytes to copy (>= 0x60 got us into this * part of the code, and we may have knocked that down by as much * as 0x1c getting aligned). * * This loop basically works out to: * do { * prefetch-next-cacheline(s) * bytes -= 0x20; * copy cacheline * } while (bytes >= 0x40); * bytes -= 0x20; * copy cacheline */ .Licaligned: PREFETCH(r0, 32) PREFETCH(r1, 32) sub r2, r2, #0x20 /* Copy a cacheline */ ldrt r10, [r0], #4 ldrt r11, [r0], #4 ldrt r6, [r0], #4 ldrt r7, [r0], #4 ldrt r8, [r0], #4 ldrt r9, [r0], #4 stmia r1!, {r10-r11} ldrt r10, [r0], #4 ldrt r11, [r0], #4 stmia r1!, {r6-r11} cmp r2, #0x40 bge .Licaligned sub r2, r2, #0x20 /* Copy a cacheline */ ldrt r10, [r0], #4 ldrt r11, [r0], #4 ldrt r6, [r0], #4 ldrt r7, [r0], #4 ldrt r8, [r0], #4 ldrt r9, [r0], #4 stmia r1!, {r10-r11} ldrt r10, [r0], #4 ldrt r11, [r0], #4 stmia r1!, {r6-r11} cmp r2, #0x08 blt .Liprecleanup .Licleanup8: ldrt r8, [r0], #4 ldrt r9, [r0], #4 sub r2, r2, #8 stmia r1!, {r8, r9} cmp r2, #8 bge .Licleanup8 .Liprecleanup: /* * If we're done, bail. 
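 * The cleanup below uses the same computed-goto trick as the alignment
 * code above: the low two bits of the remaining count pick an entry
 * point into an unrolled byte copy, which then repeats in groups of
 * four until the count reaches zero.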
*/ cmp r2, #0 beq .Lout .Licleanup: and r6, r2, #0x3 ldr pc, [pc, r6, lsl #2] b .Licend .word .Lic4 .word .Lic1 .word .Lic2 .word .Lic3 .Lic4: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lic3: ldrbt r7, [r0], #1 sub r2, r2, #1 strb r7, [r1], #1 .Lic2: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lic1: ldrbt r7, [r0], #1 subs r2, r2, #1 strb r7, [r1], #1 .Licend: bne .Licleanup .Liout: mov r0, #0 str r5, [r4, #PCB_ONFAULT] RESTORE_REGS RET .Lcopyfault: mov r0, #14 /* EFAULT */ str r5, [r4, #PCB_ONFAULT] RESTORE_REGS RET /* * r0 = kernel space address * r1 = user space address * r2 = length * * Copies bytes from kernel space to user space * * We save/restore r4-r11: * r4-r11 are scratch */ ENTRY(copyout) /* Quick exit if length is zero */ teq r2, #0 moveq r0, #0 RETeq ldr r3, .L_arm_memcpy ldr r3, [r3] cmp r3, #0 beq .Lnormale ldr r3, .L_min_memcpy_size ldr r3, [r3] cmp r2, r3 blt .Lnormale stmfd sp!, {r0-r2, r4, lr} mov r3, r0 mov r0, r1 mov r1, r3 mov r3, #1 /* DST_IS_USER */ ldr r4, .L_arm_memcpy mov lr, pc ldr pc, [r4] cmp r0, #0 ldmfd sp!, {r0-r2, r4, lr} moveq r0, #0 RETeq .Lnormale: SAVE_REGS #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r2, r14} bl _C_LABEL(cpu_number) ldr r4, .Lcpu_info ldr r4, [r4, r0, lsl #2] ldr r4, [r4, #CI_CURPCB] ldmfd sp!, {r0-r2, r14} #else ldr r4, .Lcurpcb ldr r4, [r4] #endif ldr r5, [r4, #PCB_ONFAULT] adr r3, .Lcopyfault str r3, [r4, #PCB_ONFAULT] PREFETCH(r0, 0) PREFETCH(r1, 0) /* * If not too many bytes, take the slow path. */ cmp r2, #0x08 blt .Lcleanup /* * Align destination to word boundary. */ and r6, r1, #0x3 ldr pc, [pc, r6, lsl #2] b .Lalend .word .Lalend .word .Lal3 .word .Lal2 .word .Lal1 .Lal3: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lal2: ldrb r7, [r0], #1 sub r2, r2, #1 strbt r7, [r1], #1 .Lal1: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lalend: /* * If few bytes left, finish slow. */ cmp r2, #0x08 blt .Lcleanup /* * If source is not aligned, finish slow. */ ands r3, r0, #0x03 bne .Lcleanup cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */ blt .Lcleanup8 /* * Align source & destination to cacheline boundary. */ and r6, r1, #0x1f ldr pc, [pc, r6] b .Lcaligned .word .Lcaligned .word .Lcal28 .word .Lcal24 .word .Lcal20 .word .Lcal16 .word .Lcal12 .word .Lcal8 .word .Lcal4 .Lcal28:ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 .Lcal24:ldr r7, [r0], #4 sub r2, r2, #4 strt r7, [r1], #4 .Lcal20:ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 .Lcal16:ldr r7, [r0], #4 sub r2, r2, #4 strt r7, [r1], #4 .Lcal12:ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 .Lcal8: ldr r7, [r0], #4 sub r2, r2, #4 strt r7, [r1], #4 .Lcal4: ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 /* * We start with > 0x40 bytes to copy (>= 0x60 got us into this * part of the code, and we may have knocked that down by as much * as 0x1c getting aligned). 
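 * (Worst case for the alignment step above is seven words, i.e.
 *  7 * 4 = 0x1c bytes, so at least 0x60 - 0x1c = 0x44 > 0x40 remain.)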
* * This loop basically works out to: * do { * prefetch-next-cacheline(s) * bytes -= 0x20; * copy cacheline * } while (bytes >= 0x40); * bytes -= 0x20; * copy cacheline */ .Lcaligned: PREFETCH(r0, 32) PREFETCH(r1, 32) sub r2, r2, #0x20 /* Copy a cacheline */ ldmia r0!, {r6-r11} strt r6, [r1], #4 strt r7, [r1], #4 ldmia r0!, {r6-r7} strt r8, [r1], #4 strt r9, [r1], #4 strt r10, [r1], #4 strt r11, [r1], #4 strt r6, [r1], #4 strt r7, [r1], #4 cmp r2, #0x40 bge .Lcaligned sub r2, r2, #0x20 /* Copy a cacheline */ ldmia r0!, {r6-r11} strt r6, [r1], #4 strt r7, [r1], #4 ldmia r0!, {r6-r7} strt r8, [r1], #4 strt r9, [r1], #4 strt r10, [r1], #4 strt r11, [r1], #4 strt r6, [r1], #4 strt r7, [r1], #4 cmp r2, #0x08 blt .Lprecleanup .Lcleanup8: ldmia r0!, {r8-r9} sub r2, r2, #8 strt r8, [r1], #4 strt r9, [r1], #4 cmp r2, #8 bge .Lcleanup8 .Lprecleanup: /* * If we're done, bail. */ cmp r2, #0 beq .Lout .Lcleanup: and r6, r2, #0x3 ldr pc, [pc, r6, lsl #2] b .Lcend .word .Lc4 .word .Lc1 .word .Lc2 .word .Lc3 .Lc4: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lc3: ldrb r7, [r0], #1 sub r2, r2, #1 strbt r7, [r1], #1 .Lc2: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lc1: ldrb r7, [r0], #1 subs r2, r2, #1 strbt r7, [r1], #1 .Lcend: bne .Lcleanup .Lout: mov r0, #0 str r5, [r4, #PCB_ONFAULT] RESTORE_REGS RET #endif /* * int badaddr_read_1(const uint8_t *src, uint8_t *dest) * * Copies a single 8-bit value from src to dest, returning 0 on success, * else EFAULT if a page fault occurred. */ ENTRY(badaddr_read_1) #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r1, r14} bl _C_LABEL(cpu_number) ldr r2, .Lcpu_info ldr r2, [r2, r0, lsl #2] ldr r2, [r2, #CI_CURPCB] ldmfd sp!, {r0-r1, r14} #else ldr r2, .Lcurpcb ldr r2, [r2] #endif ldr ip, [r2, #PCB_ONFAULT] adr r3, 1f str r3, [r2, #PCB_ONFAULT] nop nop nop ldrb r3, [r0] nop nop nop strb r3, [r1] mov r0, #0 /* No fault */ 1: str ip, [r2, #PCB_ONFAULT] RET /* * int badaddr_read_2(const uint16_t *src, uint16_t *dest) * * Copies a single 16-bit value from src to dest, returning 0 on success, * else EFAULT if a page fault occurred. */ ENTRY(badaddr_read_2) #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r1, r14} bl _C_LABEL(cpu_number) ldr r2, .Lcpu_info ldr r2, [r2, r0, lsl #2] ldr r2, [r2, #CI_CURPCB] ldmfd sp!, {r0-r1, r14} #else ldr r2, .Lcurpcb ldr r2, [r2] #endif ldr ip, [r2, #PCB_ONFAULT] adr r3, 1f str r3, [r2, #PCB_ONFAULT] nop nop nop ldrh r3, [r0] nop nop nop strh r3, [r1] mov r0, #0 /* No fault */ 1: str ip, [r2, #PCB_ONFAULT] RET /* * int badaddr_read_4(const uint32_t *src, uint32_t *dest) * * Copies a single 32-bit value from src to dest, returning 0 on success, * else EFAULT if a page fault occurred. 
 */
ENTRY(badaddr_read_4)
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	stmfd	sp!, {r0-r1, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r2, .Lcpu_info
	ldr	r2, [r2, r0, lsl #2]
	ldr	r2, [r2, #CI_CURPCB]
	ldmfd	sp!, {r0-r1, r14}
#else
	ldr	r2, .Lcurpcb
	ldr	r2, [r2]
#endif
	ldr	ip, [r2, #PCB_ONFAULT]
	adr	r3, 1f
	str	r3, [r2, #PCB_ONFAULT]
	nop
	nop
	nop
	ldr	r3, [r0]
	nop
	nop
	nop
	str	r3, [r1]
	mov	r0, #0		/* No fault */
1:	str	ip, [r2, #PCB_ONFAULT]
	RET

Index: head/sys/arm/arm/in_cksum_arm.S
===================================================================
--- head/sys/arm/arm/in_cksum_arm.S	(revision 172613)
+++ head/sys/arm/arm/in_cksum_arm.S	(revision 172614)
@@ -1,339 +1,339 @@
/* $NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $ */

/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed for the NetBSD Project by
 *	Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
- * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
+ * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e
 */

#include "opt_inet.h"

#include <machine/asm.h>
#include "assym.s"

__FBSDID("$FreeBSD$");

/*
 * int in_cksum(struct mbuf *m, int len)
 *
 * Entry:
 *	r0	m
 *	r1	len
 *
 * NOTE: Assumes 'm' is *never* NULL.
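 *
 * In outline, the mbuf walk below behaves like this illustrative C
 * sketch (names are descriptive only, not the real implementation):
 *
 *	sum = 0;
 *	for (; m != NULL; m = m->m_next) {
 *		n = min(m->m_len, len);
 *		len -= n;
 *		partial = L_cksumdata(m->m_data, n);
 *		if (the chunk starts at an odd overall offset)
 *			partial = ror(partial, 8);
 *		sum += partial;		/* with end-around carry */
 *	}
 *	return (~fold_to_16_bits(sum) & 0xffff);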
*/ /* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */ ENTRY(in_cksum) stmfd sp!, {r4-r11,lr} mov r8, #0x00 mov r9, r1 mov r10, #0x00 mov ip, r0 .Lin_cksum_loop: ldr r1, [ip, #(M_LEN)] ldr r0, [ip, #(M_DATA)] ldr ip, [ip, #(M_NEXT)] .Lin_cksum_entry4: cmp r9, r1 movlt r1, r9 sub r9, r9, r1 eor r11, r10, r0 add r10, r10, r1 adds r2, r1, #0x00 blne _ASM_LABEL(L_cksumdata) tst r11, #0x01 movne r2, r2, ror #8 adds r8, r8, r2 adc r8, r8, #0x00 cmp ip, #0x00 bne .Lin_cksum_loop mov r1, #0xff orr r1, r1, #0xff00 and r0, r8, r1 add r0, r0, r8, lsr #16 add r0, r0, r0, lsr #16 and r0, r0, r1 eor r0, r0, r1 ldmfd sp!, {r4-r11,pc} ENTRY(do_cksum) stmfd sp!, {r4-r7, lr} bl L_cksumdata mov r0, r2 ldmfd sp!, {r4-r7, pc} /* * The main in*_cksum() workhorse... * * Entry parameters: * r0 Pointer to buffer * r1 Buffer length * lr Return address * * Returns: * r2 Accumulated 32-bit sum * * Clobbers: * r0-r7 */ /* LINTSTUB: Ignore */ ASENTRY_NP(L_cksumdata) -#ifdef __XSCALE__ +#ifdef _ARM_ARCH_5E pld [r0] /* Pre-fetch the start of the buffer */ #endif mov r2, #0 /* We first have to word-align the buffer. */ ands r7, r0, #0x03 beq .Lcksumdata_wordaligned rsb r7, r7, #0x04 cmp r1, r7 /* Enough bytes left to make it? */ blt .Lcksumdata_endgame cmp r7, #0x02 ldrb r4, [r0], #0x01 /* Fetch 1st byte */ ldrgeb r5, [r0], #0x01 /* Fetch 2nd byte */ movlt r5, #0x00 ldrgtb r6, [r0], #0x01 /* Fetch 3rd byte */ movle r6, #0x00 /* Combine the three bytes depending on endianness and alignment */ #ifdef __ARMEB__ orreq r2, r5, r4, lsl #8 orreq r2, r2, r6, lsl #24 orrne r2, r4, r5, lsl #8 orrne r2, r2, r6, lsl #16 #else orreq r2, r4, r5, lsl #8 orreq r2, r2, r6, lsl #16 orrne r2, r5, r4, lsl #8 orrne r2, r2, r6, lsl #24 #endif subs r1, r1, r7 /* Update length */ RETeq /* All done? */ /* Buffer is now word aligned */ .Lcksumdata_wordaligned: -#ifdef __XSCALE__ +#ifdef _ARM_ARCH_5E cmp r1, #0x04 /* Less than 4 bytes left? */ blt .Lcksumdata_endgame /* Yup */ /* Now quad-align, if necessary */ ands r7, r0, #0x04 ldrne r7, [r0], #0x04 subne r1, r1, #0x04 subs r1, r1, #0x40 blt .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */ /* * Buffer is now quad aligned. Sum 64 bytes at a time. * Note: First ldrd is hoisted above the loop, together with * setting r6 to zero to avoid stalling for results in the * loop. (r7 is live, from above). 
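 *
 * ldrd loads an even/odd register pair (e.g. r4/r5) and, on armv5e,
 * requires a 64-bit aligned address, which is why the buffer was
 * quad-aligned above.
 */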
*/ ldrd r4, [r0], #0x08 mov r6, #0x00 .Lcksumdata_bigloop: pld [r0, #0x18] adds r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 pld [r0, #0x18] ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 adc r2, r2, #0x00 subs r1, r1, #0x40 ldrged r4, [r0], #0x08 bge .Lcksumdata_bigloop adds r2, r2, r6 /* r6/r7 still need summing */ .Lcksumdata_bigloop_end: adcs r2, r2, r7 adc r2, r2, #0x00 -#else /* !__XSCALE__ */ +#else /* !_ARM_ARCH_5E */ subs r1, r1, #0x40 blt .Lcksumdata_bigloop_end .Lcksumdata_bigloop: ldmia r0!, {r3, r4, r5, r6} adds r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r7} adcs r2, r2, r6 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r6} adcs r2, r2, r7 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r7} adcs r2, r2, r6 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 adcs r2, r2, r7 adc r2, r2, #0x00 subs r1, r1, #0x40 bge .Lcksumdata_bigloop .Lcksumdata_bigloop_end: #endif adds r1, r1, #0x40 RETeq cmp r1, #0x20 -#ifdef __XSCALE__ +#ifdef _ARM_ARCH_5E ldrged r4, [r0], #0x08 /* Avoid stalling pld and result */ blt .Lcksumdata_less_than_32 pld [r0, #0x18] ldrd r6, [r0], #0x08 adds r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 adcs r2, r2, r6 /* XXX: Unavoidable result stall */ adcs r2, r2, r7 #else blt .Lcksumdata_less_than_32 ldmia r0!, {r3, r4, r5, r6} adds r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r7} adcs r2, r2, r6 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 adcs r2, r2, r7 #endif adc r2, r2, #0x00 subs r1, r1, #0x20 RETeq .Lcksumdata_less_than_32: /* There are less than 32 bytes left */ and r3, r1, #0x18 rsb r4, r3, #0x18 sub r1, r1, r3 adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */ addne pc, pc, r4 nop /* - * Note: We use ldm here, even on Xscale, since the combined issue/result + * Note: We use ldm here, even on armv5e, since the combined issue/result * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs. */ /* At least 24 bytes remaining... */ ldmia r0!, {r4, r5} adcs r2, r2, r4 adcs r2, r2, r5 /* At least 16 bytes remaining... */ ldmia r0!, {r4, r5} adcs r2, r2, r4 adcs r2, r2, r5 /* At least 8 bytes remaining... */ ldmia r0!, {r4, r5} adcs r2, r2, r4 adcs r2, r2, r5 /* Less than 8 bytes remaining... 
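 * (The adcs/adc chains above implement one's-complement addition:
 *  each carry out of bit 31 is fed back into the sum, and a trailing
 *  "adc r2, r2, #0x00" folds in the final carry.)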
*/ adc r2, r2, #0x00 subs r1, r1, #0x04 blt .Lcksumdata_lessthan4 ldr r4, [r0], #0x04 sub r1, r1, #0x04 adds r2, r2, r4 adc r2, r2, #0x00 /* Deal with < 4 bytes remaining */ .Lcksumdata_lessthan4: adds r1, r1, #0x04 RETeq /* Deal with 1 to 3 remaining bytes, possibly misaligned */ .Lcksumdata_endgame: ldrb r3, [r0] /* Fetch first byte */ cmp r1, #0x02 ldrgeb r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */ movlt r4, #0x00 ldrgtb r5, [r0, #0x02] movle r5, #0x00 /* Combine the three bytes depending on endianness and alignment */ tst r0, #0x01 #ifdef __ARMEB__ orreq r3, r4, r3, lsl #8 orreq r3, r3, r5, lsl #24 orrne r3, r3, r4, lsl #8 orrne r3, r3, r5, lsl #16 #else orreq r3, r3, r4, lsl #8 orreq r3, r3, r5, lsl #16 orrne r3, r4, r3, lsl #8 orrne r3, r3, r5, lsl #24 #endif adds r2, r2, r3 adc r2, r2, #0x00 RET Index: head/sys/arm/arm/support.S =================================================================== --- head/sys/arm/arm/support.S (revision 172613) +++ head/sys/arm/arm/support.S (revision 172614) @@ -1,2888 +1,2888 @@ /*- * Copyright (c) 2004 Olivier Houchard * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 */

#include <machine/asm.h>
#include <machine/param.h>
__FBSDID("$FreeBSD$");

#include "assym.s"

.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)

/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
	ldr	r3, .L_arm_bzero
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal0
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2
	blt	.Lnormal0
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0
	mov	lr, pc
	mov	pc, r3
	cmp	r0, #0
	ldmfd	sp!, {r0, r1, lr}
	RETeq
.Lnormal0:
	mov	r3, #0x00
	b	do_memset

/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
-#ifdef __XSCALE__
-	tst	ip, #0x04		/* Quad-align for Xscale */
+#ifdef _ARM_ARCH_5E
+	tst	ip, #0x04		/* Quad-align for armv5e */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
-#ifdef __XSCALE__
+#ifdef _ARM_ARCH_5E
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
#endif
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
-#ifdef __XSCALE__
+#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
-#ifdef __XSCALE__
+#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
-#ifdef __XSCALE__
+#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

-#ifdef __XSCALE__
+#ifdef _ARM_ARCH_5E
	/* Compensate
for 64-bit alignment check */ adds r1, r1, #0x04 RETeq cmp r1, #2 #else cmp r1, #-2 #endif strb r3, [ip], #0x01 /* Set 1 byte */ strgeb r3, [ip], #0x01 /* Set another byte */ strgtb r3, [ip] /* and a third */ RET /* Exit */ .Lmemset_wordunaligned: rsb r2, r2, #0x004 strb r3, [ip], #0x01 /* Set 1 byte */ cmp r2, #0x02 strgeb r3, [ip], #0x01 /* Set another byte */ sub r1, r1, r2 strgtb r3, [ip], #0x01 /* and a third */ cmp r1, #0x04 /* More than 4 bytes left? */ bge .Lmemset_wordaligned /* Yup */ .Lmemset_lessthanfour: cmp r1, #0x00 RETeq /* Zero length so exit */ strb r3, [ip], #0x01 /* Set 1 byte */ cmp r1, #0x02 strgeb r3, [ip], #0x01 /* Set another byte */ strgtb r3, [ip] /* and a third */ RET /* Exit */ ENTRY(bcmp) mov ip, r0 cmp r2, #0x06 beq .Lmemcmp_6bytes mov r0, #0x00 /* Are both addresses aligned the same way? */ cmp r2, #0x00 eornes r3, ip, r1 RETeq /* len == 0, or same addresses! */ tst r3, #0x03 subne r2, r2, #0x01 bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */ /* Word-align the addresses, if necessary */ sub r3, r1, #0x05 ands r3, r3, #0x03 add r3, r3, r3, lsl #1 addne pc, pc, r3, lsl #3 nop /* Compare up to 3 bytes */ ldrb r0, [ip], #0x01 ldrb r3, [r1], #0x01 subs r0, r0, r3 RETne subs r2, r2, #0x01 RETeq /* Compare up to 2 bytes */ ldrb r0, [ip], #0x01 ldrb r3, [r1], #0x01 subs r0, r0, r3 RETne subs r2, r2, #0x01 RETeq /* Compare 1 byte */ ldrb r0, [ip], #0x01 ldrb r3, [r1], #0x01 subs r0, r0, r3 RETne subs r2, r2, #0x01 RETeq /* Compare 4 bytes at a time, if possible */ subs r2, r2, #0x04 bcc .Lmemcmp_bytewise .Lmemcmp_word_aligned: ldr r0, [ip], #0x04 ldr r3, [r1], #0x04 subs r2, r2, #0x04 cmpcs r0, r3 beq .Lmemcmp_word_aligned sub r0, r0, r3 /* Correct for extra subtraction, and check if done */ adds r2, r2, #0x04 cmpeq r0, #0x00 /* If done, did all bytes match? */ RETeq /* Yup. Just return */ /* Re-do the final word byte-wise */ sub ip, ip, #0x04 sub r1, r1, #0x04 .Lmemcmp_bytewise: add r2, r2, #0x03 .Lmemcmp_bytewise2: ldrb r0, [ip], #0x01 ldrb r3, [r1], #0x01 subs r2, r2, #0x01 cmpcs r0, r3 beq .Lmemcmp_bytewise2 sub r0, r0, r3 RET /* * 6 byte compares are very common, thanks to the network stack. * This code is hand-scheduled to reduce the number of stalls for * load results. Everything else being equal, this will be ~32% * faster than a byte-wise memcmp. */ .align 5 .Lmemcmp_6bytes: ldrb r3, [r1, #0x00] /* r3 = b2#0 */ ldrb r0, [ip, #0x00] /* r0 = b1#0 */ ldrb r2, [r1, #0x01] /* r2 = b2#1 */ subs r0, r0, r3 /* r0 = b1#0 - b2#0 */ ldreqb r3, [ip, #0x01] /* r3 = b1#1 */ RETne /* Return if mismatch on #0 */ subs r0, r3, r2 /* r0 = b1#1 - b2#1 */ ldreqb r3, [r1, #0x02] /* r3 = b2#2 */ ldreqb r0, [ip, #0x02] /* r0 = b1#2 */ RETne /* Return if mismatch on #1 */ ldrb r2, [r1, #0x03] /* r2 = b2#3 */ subs r0, r0, r3 /* r0 = b1#2 - b2#2 */ ldreqb r3, [ip, #0x03] /* r3 = b1#3 */ RETne /* Return if mismatch on #2 */ subs r0, r3, r2 /* r0 = b1#3 - b2#3 */ ldreqb r3, [r1, #0x04] /* r3 = b2#4 */ ldreqb r0, [ip, #0x04] /* r0 = b1#4 */ RETne /* Return if mismatch on #3 */ ldrb r2, [r1, #0x05] /* r2 = b2#5 */ subs r0, r0, r3 /* r0 = b1#4 - b2#4 */ ldreqb r3, [ip, #0x05] /* r3 = b1#5 */ RETne /* Return if mismatch on #4 */ sub r0, r3, r2 /* r0 = b1#5 - b2#5 */ RET ENTRY(bcopy) /* switch the source and destination registers */ eor r0, r1, r0 eor r1, r0, r1 eor r0, r1, r0 ENTRY(memmove) /* Do the buffers overlap? 
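 * Roughly: if the regions are at least len bytes apart they are
 * disjoint and plain memcpy is safe; otherwise copy forwards when the
 * destination precedes the source and backwards when it follows it.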
 */
	cmp	r0, r1
	RETeq			/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul	/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less the 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ...
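	 * The source is read word-at-a-time from its rounded-down address
	 * and each destination word is rebuilt with shifts; e.g. for a
	 * one-byte skew on little-endian: out = (prev >> 8) | (next << 24);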
*/ .Lmemmove_fsrcul: bic r1, r1, #3 ldr lr, [r1], #4 cmp r12, #2 bgt .Lmemmove_fsrcul3 beq .Lmemmove_fsrcul2 cmp r2, #0x0c blt .Lmemmove_fsrcul1loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5} .Lmemmove_fsrcul1loop16: #ifdef __ARMEB__ mov r3, lr, lsl #8 #else mov r3, lr, lsr #8 #endif ldmia r1!, {r4, r5, r12, lr} #ifdef __ARMEB__ orr r3, r3, r4, lsr #24 mov r4, r4, lsl #8 orr r4, r4, r5, lsr #24 mov r5, r5, lsl #8 orr r5, r5, r12, lsr #24 mov r12, r12, lsl #8 orr r12, r12, lr, lsr #24 #else orr r3, r3, r4, lsl #24 mov r4, r4, lsr #8 orr r4, r4, r5, lsl #24 mov r5, r5, lsr #8 orr r5, r5, r12, lsl #24 mov r12, r12, lsr #8 orr r12, r12, lr, lsl #24 #endif stmia r0!, {r3-r5, r12} subs r2, r2, #0x10 bge .Lmemmove_fsrcul1loop16 ldmia sp!, {r4, r5} adds r2, r2, #0x0c blt .Lmemmove_fsrcul1l4 .Lmemmove_fsrcul1loop4: #ifdef __ARMEB__ mov r12, lr, lsl #8 #else mov r12, lr, lsr #8 #endif ldr lr, [r1], #4 #ifdef __ARMEB__ orr r12, r12, lr, lsr #24 #else orr r12, r12, lr, lsl #24 #endif str r12, [r0], #4 subs r2, r2, #4 bge .Lmemmove_fsrcul1loop4 .Lmemmove_fsrcul1l4: sub r1, r1, #3 b .Lmemmove_fl4 .Lmemmove_fsrcul2: cmp r2, #0x0c blt .Lmemmove_fsrcul2loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5} .Lmemmove_fsrcul2loop16: #ifdef __ARMEB__ mov r3, lr, lsl #16 #else mov r3, lr, lsr #16 #endif ldmia r1!, {r4, r5, r12, lr} #ifdef __ARMEB__ orr r3, r3, r4, lsr #16 mov r4, r4, lsl #16 orr r4, r4, r5, lsr #16 mov r5, r5, lsl #16 orr r5, r5, r12, lsr #16 mov r12, r12, lsl #16 orr r12, r12, lr, lsr #16 #else orr r3, r3, r4, lsl #16 mov r4, r4, lsr #16 orr r4, r4, r5, lsl #16 mov r5, r5, lsr #16 orr r5, r5, r12, lsl #16 mov r12, r12, lsr #16 orr r12, r12, lr, lsl #16 #endif stmia r0!, {r3-r5, r12} subs r2, r2, #0x10 bge .Lmemmove_fsrcul2loop16 ldmia sp!, {r4, r5} adds r2, r2, #0x0c blt .Lmemmove_fsrcul2l4 .Lmemmove_fsrcul2loop4: #ifdef __ARMEB__ mov r12, lr, lsl #16 #else mov r12, lr, lsr #16 #endif ldr lr, [r1], #4 #ifdef __ARMEB__ orr r12, r12, lr, lsr #16 #else orr r12, r12, lr, lsl #16 #endif str r12, [r0], #4 subs r2, r2, #4 bge .Lmemmove_fsrcul2loop4 .Lmemmove_fsrcul2l4: sub r1, r1, #2 b .Lmemmove_fl4 .Lmemmove_fsrcul3: cmp r2, #0x0c blt .Lmemmove_fsrcul3loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5} .Lmemmove_fsrcul3loop16: #ifdef __ARMEB__ mov r3, lr, lsl #24 #else mov r3, lr, lsr #24 #endif ldmia r1!, {r4, r5, r12, lr} #ifdef __ARMEB__ orr r3, r3, r4, lsr #8 mov r4, r4, lsl #24 orr r4, r4, r5, lsr #8 mov r5, r5, lsl #24 orr r5, r5, r12, lsr #8 mov r12, r12, lsl #24 orr r12, r12, lr, lsr #8 #else orr r3, r3, r4, lsl #8 mov r4, r4, lsr #24 orr r4, r4, r5, lsl #8 mov r5, r5, lsr #24 orr r5, r5, r12, lsl #8 mov r12, r12, lsr #24 orr r12, r12, lr, lsl #8 #endif stmia r0!, {r3-r5, r12} subs r2, r2, #0x10 bge .Lmemmove_fsrcul3loop16 ldmia sp!, {r4, r5} adds r2, r2, #0x0c blt .Lmemmove_fsrcul3l4 .Lmemmove_fsrcul3loop4: #ifdef __ARMEB__ mov r12, lr, lsl #24 #else mov r12, lr, lsr #24 #endif ldr lr, [r1], #4 #ifdef __ARMEB__ orr r12, r12, lr, lsr #8 #else orr r12, r12, lr, lsl #8 #endif str r12, [r0], #4 subs r2, r2, #4 bge .Lmemmove_fsrcul3loop4 .Lmemmove_fsrcul3l4: sub r1, r1, #1 b .Lmemmove_fl4 .Lmemmove_backwards: add r1, r1, r2 add r0, r0, r2 subs r2, r2, #4 blt .Lmemmove_bl4 /* less than 4 bytes */ ands r12, r0, #3 bne .Lmemmove_bdestul /* oh unaligned destination addr */ ands r12, r1, #3 bne .Lmemmove_bsrcul /* oh unaligned source addr */ .Lmemmove_bt8: /* We have aligned source and destination */ subs r2, r2, #8 blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */ stmdb sp!, {r4, lr} subs r2, r2, #0x14 /* less 
than 32 bytes (12 from above) */ blt .Lmemmove_bl32 /* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ .Lmemmove_bloop32: ldmdb r1!, {r3, r4, r12, lr} stmdb r0!, {r3, r4, r12, lr} ldmdb r1!, {r3, r4, r12, lr} stmdb r0!, {r3, r4, r12, lr} subs r2, r2, #0x20 bge .Lmemmove_bloop32 .Lmemmove_bl32: cmn r2, #0x10 ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ stmgedb r0!, {r3, r4, r12, lr} subge r2, r2, #0x10 adds r2, r2, #0x14 ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */ stmgedb r0!, {r3, r12, lr} subge r2, r2, #0x0c ldmia sp!, {r4, lr} .Lmemmove_bl12: adds r2, r2, #8 blt .Lmemmove_bl4 subs r2, r2, #4 ldrlt r3, [r1, #-4]! strlt r3, [r0, #-4]! ldmgedb r1!, {r3, r12} stmgedb r0!, {r3, r12} subge r2, r2, #4 .Lmemmove_bl4: /* less than 4 bytes to go */ adds r2, r2, #4 RETeq /* done */ /* copy the crud byte at a time */ cmp r2, #2 ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! RET /* erg - unaligned destination */ .Lmemmove_bdestul: cmp r12, #2 /* align destination with byte copies */ ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! subs r2, r2, r12 blt .Lmemmove_bl4 /* less than 4 bytes to go */ ands r12, r1, #3 beq .Lmemmove_bt8 /* we have an aligned source */ /* erg - unaligned source */ /* This is where it gets nasty ... */ .Lmemmove_bsrcul: bic r1, r1, #3 ldr r3, [r1, #0] cmp r12, #2 blt .Lmemmove_bsrcul1 beq .Lmemmove_bsrcul2 cmp r2, #0x0c blt .Lmemmove_bsrcul3loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5, lr} .Lmemmove_bsrcul3loop16: #ifdef __ARMEB__ mov lr, r3, lsr #8 #else mov lr, r3, lsl #8 #endif ldmdb r1!, {r3-r5, r12} #ifdef __ARMEB__ orr lr, lr, r12, lsl #24 mov r12, r12, lsr #8 orr r12, r12, r5, lsl #24 mov r5, r5, lsr #8 orr r5, r5, r4, lsl #24 mov r4, r4, lsr #8 orr r4, r4, r3, lsl #24 #else orr lr, lr, r12, lsr #24 mov r12, r12, lsl #8 orr r12, r12, r5, lsr #24 mov r5, r5, lsl #8 orr r5, r5, r4, lsr #24 mov r4, r4, lsl #8 orr r4, r4, r3, lsr #24 #endif stmdb r0!, {r4, r5, r12, lr} subs r2, r2, #0x10 bge .Lmemmove_bsrcul3loop16 ldmia sp!, {r4, r5, lr} adds r2, r2, #0x0c blt .Lmemmove_bsrcul3l4 .Lmemmove_bsrcul3loop4: #ifdef __ARMEB__ mov r12, r3, lsr #8 #else mov r12, r3, lsl #8 #endif ldr r3, [r1, #-4]! #ifdef __ARMEB__ orr r12, r12, r3, lsl #24 #else orr r12, r12, r3, lsr #24 #endif str r12, [r0, #-4]! subs r2, r2, #4 bge .Lmemmove_bsrcul3loop4 .Lmemmove_bsrcul3l4: add r1, r1, #3 b .Lmemmove_bl4 .Lmemmove_bsrcul2: cmp r2, #0x0c blt .Lmemmove_bsrcul2loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5, lr} .Lmemmove_bsrcul2loop16: #ifdef __ARMEB__ mov lr, r3, lsr #16 #else mov lr, r3, lsl #16 #endif ldmdb r1!, {r3-r5, r12} #ifdef __ARMEB__ orr lr, lr, r12, lsl #16 mov r12, r12, lsr #16 orr r12, r12, r5, lsl #16 mov r5, r5, lsr #16 orr r5, r5, r4, lsl #16 mov r4, r4, lsr #16 orr r4, r4, r3, lsl #16 #else orr lr, lr, r12, lsr #16 mov r12, r12, lsl #16 orr r12, r12, r5, lsr #16 mov r5, r5, lsl #16 orr r5, r5, r4, lsr #16 mov r4, r4, lsl #16 orr r4, r4, r3, lsr #16 #endif stmdb r0!, {r4, r5, r12, lr} subs r2, r2, #0x10 bge .Lmemmove_bsrcul2loop16 ldmia sp!, {r4, r5, lr} adds r2, r2, #0x0c blt .Lmemmove_bsrcul2l4 .Lmemmove_bsrcul2loop4: #ifdef __ARMEB__ mov r12, r3, lsr #16 #else mov r12, r3, lsl #16 #endif ldr r3, [r1, #-4]! #ifdef __ARMEB__ orr r12, r12, r3, lsl #16 #else orr r12, r12, r3, lsr #16 #endif str r12, [r0, #-4]! 
subs r2, r2, #4 bge .Lmemmove_bsrcul2loop4 .Lmemmove_bsrcul2l4: add r1, r1, #2 b .Lmemmove_bl4 .Lmemmove_bsrcul1: cmp r2, #0x0c blt .Lmemmove_bsrcul1loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5, lr} .Lmemmove_bsrcul1loop32: #ifdef __ARMEB__ mov lr, r3, lsr #24 #else mov lr, r3, lsl #24 #endif ldmdb r1!, {r3-r5, r12} #ifdef __ARMEB__ orr lr, lr, r12, lsl #8 mov r12, r12, lsr #24 orr r12, r12, r5, lsl #8 mov r5, r5, lsr #24 orr r5, r5, r4, lsl #8 mov r4, r4, lsr #24 orr r4, r4, r3, lsl #8 #else orr lr, lr, r12, lsr #8 mov r12, r12, lsl #24 orr r12, r12, r5, lsr #8 mov r5, r5, lsl #24 orr r5, r5, r4, lsr #8 mov r4, r4, lsl #24 orr r4, r4, r3, lsr #8 #endif stmdb r0!, {r4, r5, r12, lr} subs r2, r2, #0x10 bge .Lmemmove_bsrcul1loop32 ldmia sp!, {r4, r5, lr} adds r2, r2, #0x0c blt .Lmemmove_bsrcul1l4 .Lmemmove_bsrcul1loop4: #ifdef __ARMEB__ mov r12, r3, lsr #24 #else mov r12, r3, lsl #24 #endif ldr r3, [r1, #-4]! #ifdef __ARMEB__ orr r12, r12, r3, lsl #8 #else orr r12, r12, r3, lsr #8 #endif str r12, [r0, #-4]! subs r2, r2, #4 bge .Lmemmove_bsrcul1loop4 .Lmemmove_bsrcul1l4: add r1, r1, #1 b .Lmemmove_bl4 -#if !defined(__XSCALE__) +#if !defined(_ARM_ARCH_5E) ENTRY(memcpy) /* save leaf functions having to store this away */ /* Do not check arm_memcpy if we're running from flash */ #ifdef FLASHADDR #if FLASHADDR > PHYSADDR ldr r3, =FLASHADDR cmp r3, pc bls .Lnormal #else ldr r3, =FLASHADDR cmp r3, pc bhi .Lnormal #endif #endif ldr r3, .L_arm_memcpy ldr r3, [r3] cmp r3, #0 beq .Lnormal ldr r3, .L_min_memcpy_size ldr r3, [r3] cmp r2, r3 blt .Lnormal stmfd sp!, {r0-r2, r4, lr} mov r3, #0 ldr r4, .L_arm_memcpy mov lr, pc ldr pc, [r4] cmp r0, #0 ldmfd sp!, {r0-r2, r4, lr} RETeq .Lnormal: stmdb sp!, {r0, lr} /* memcpy() returns dest addr */ subs r2, r2, #4 blt .Lmemcpy_l4 /* less than 4 bytes */ ands r12, r0, #3 bne .Lmemcpy_destul /* oh unaligned destination addr */ ands r12, r1, #3 bne .Lmemcpy_srcul /* oh unaligned source addr */ .Lmemcpy_t8: /* We have aligned source and destination */ subs r2, r2, #8 blt .Lmemcpy_l12 /* less than 12 bytes (4 from above) */ subs r2, r2, #0x14 blt .Lmemcpy_l32 /* less than 32 bytes (12 from above) */ stmdb sp!, {r4} /* borrow r4 */ /* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ .Lmemcpy_loop32: ldmia r1!, {r3, r4, r12, lr} stmia r0!, {r3, r4, r12, lr} ldmia r1!, {r3, r4, r12, lr} stmia r0!, {r3, r4, r12, lr} subs r2, r2, #0x20 bge .Lmemcpy_loop32 cmn r2, #0x10 ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ stmgeia r0!, {r3, r4, r12, lr} subge r2, r2, #0x10 ldmia sp!, {r4} /* return r4 */ .Lmemcpy_l32: adds r2, r2, #0x14 /* blat 12 bytes at a time */ .Lmemcpy_loop12: ldmgeia r1!, {r3, r12, lr} stmgeia r0!, {r3, r12, lr} subges r2, r2, #0x0c bge .Lmemcpy_loop12 .Lmemcpy_l12: adds r2, r2, #8 blt .Lmemcpy_l4 subs r2, r2, #4 ldrlt r3, [r1], #4 strlt r3, [r0], #4 ldmgeia r1!, {r3, r12} stmgeia r0!, {r3, r12} subge r2, r2, #4 .Lmemcpy_l4: /* less than 4 bytes to go */ adds r2, r2, #4 #ifdef __APCS_26_ ldmeqia sp!, {r0, pc}^ /* done */ #else ldmeqia sp!, {r0, pc} /* done */ #endif /* copy the crud byte at a time */ cmp r2, #2 ldrb r3, [r1], #1 strb r3, [r0], #1 ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 ldmia sp!, {r0, pc} /* erg - unaligned destination */ .Lmemcpy_destul: rsb r12, r12, #4 cmp r12, #2 /* align destination with byte copies */ ldrb r3, [r1], #1 strb r3, [r0], #1 ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 subs r2, r2, r12 blt 
.Lmemcpy_l4 /* less the 4 bytes */ ands r12, r1, #3 beq .Lmemcpy_t8 /* we have an aligned source */ /* erg - unaligned source */ /* This is where it gets nasty ... */ .Lmemcpy_srcul: bic r1, r1, #3 ldr lr, [r1], #4 cmp r12, #2 bgt .Lmemcpy_srcul3 beq .Lmemcpy_srcul2 cmp r2, #0x0c blt .Lmemcpy_srcul1loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5} .Lmemcpy_srcul1loop16: mov r3, lr, lsr #8 ldmia r1!, {r4, r5, r12, lr} orr r3, r3, r4, lsl #24 mov r4, r4, lsr #8 orr r4, r4, r5, lsl #24 mov r5, r5, lsr #8 orr r5, r5, r12, lsl #24 mov r12, r12, lsr #8 orr r12, r12, lr, lsl #24 stmia r0!, {r3-r5, r12} subs r2, r2, #0x10 bge .Lmemcpy_srcul1loop16 ldmia sp!, {r4, r5} adds r2, r2, #0x0c blt .Lmemcpy_srcul1l4 .Lmemcpy_srcul1loop4: mov r12, lr, lsr #8 ldr lr, [r1], #4 orr r12, r12, lr, lsl #24 str r12, [r0], #4 subs r2, r2, #4 bge .Lmemcpy_srcul1loop4 .Lmemcpy_srcul1l4: sub r1, r1, #3 b .Lmemcpy_l4 .Lmemcpy_srcul2: cmp r2, #0x0c blt .Lmemcpy_srcul2loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5} .Lmemcpy_srcul2loop16: mov r3, lr, lsr #16 ldmia r1!, {r4, r5, r12, lr} orr r3, r3, r4, lsl #16 mov r4, r4, lsr #16 orr r4, r4, r5, lsl #16 mov r5, r5, lsr #16 orr r5, r5, r12, lsl #16 mov r12, r12, lsr #16 orr r12, r12, lr, lsl #16 stmia r0!, {r3-r5, r12} subs r2, r2, #0x10 bge .Lmemcpy_srcul2loop16 ldmia sp!, {r4, r5} adds r2, r2, #0x0c blt .Lmemcpy_srcul2l4 .Lmemcpy_srcul2loop4: mov r12, lr, lsr #16 ldr lr, [r1], #4 orr r12, r12, lr, lsl #16 str r12, [r0], #4 subs r2, r2, #4 bge .Lmemcpy_srcul2loop4 .Lmemcpy_srcul2l4: sub r1, r1, #2 b .Lmemcpy_l4 .Lmemcpy_srcul3: cmp r2, #0x0c blt .Lmemcpy_srcul3loop4 sub r2, r2, #0x0c stmdb sp!, {r4, r5} .Lmemcpy_srcul3loop16: mov r3, lr, lsr #24 ldmia r1!, {r4, r5, r12, lr} orr r3, r3, r4, lsl #8 mov r4, r4, lsr #24 orr r4, r4, r5, lsl #8 mov r5, r5, lsr #24 orr r5, r5, r12, lsl #8 mov r12, r12, lsr #24 orr r12, r12, lr, lsl #8 stmia r0!, {r3-r5, r12} subs r2, r2, #0x10 bge .Lmemcpy_srcul3loop16 ldmia sp!, {r4, r5} adds r2, r2, #0x0c blt .Lmemcpy_srcul3l4 .Lmemcpy_srcul3loop4: mov r12, lr, lsr #24 ldr lr, [r1], #4 orr r12, r12, lr, lsl #8 str r12, [r0], #4 subs r2, r2, #4 bge .Lmemcpy_srcul3loop4 .Lmemcpy_srcul3l4: sub r1, r1, #1 b .Lmemcpy_l4 #else /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */ ENTRY(memcpy) pld [r1] cmp r2, #0x0c ble .Lmemcpy_short /* <= 12 bytes */ #ifdef FLASHADDR #if FLASHADDR > PHYSADDR ldr r3, =FLASHADDR cmp r3, pc bls .Lnormal #else ldr r3, =FLASHADDR cmp r3, pc bhi .Lnormal #endif #endif ldr r3, .L_arm_memcpy ldr r3, [r3] cmp r3, #0 beq .Lnormal ldr r3, .L_min_memcpy_size ldr r3, [r3] cmp r2, r3 blt .Lnormal stmfd sp!, {r0-r2, r4, lr} mov r3, #0 ldr r4, .L_arm_memcpy mov lr, pc ldr pc, [r4] cmp r0, #0 ldmfd sp!, {r0-r2, r4, lr} RETeq .Lnormal: mov r3, r0 /* We must not clobber r0 */ /* Word-align the destination buffer */ ands ip, r3, #0x03 /* Already word aligned? */ beq .Lmemcpy_wordaligned /* Yup */ cmp ip, #0x02 ldrb ip, [r1], #0x01 sub r2, r2, #0x01 strb ip, [r3], #0x01 ldrleb ip, [r1], #0x01 suble r2, r2, #0x01 strleb ip, [r3], #0x01 ldrltb ip, [r1], #0x01 sublt r2, r2, #0x01 strltb ip, [r3], #0x01 /* Destination buffer is now word aligned */ .Lmemcpy_wordaligned: ands ip, r1, #0x03 /* Is src also word-aligned? */ bne .Lmemcpy_bad_align /* Nope. Things just got bad */ /* Quad-align the destination buffer */ tst r3, #0x07 /* Already quad aligned? 
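 * (strd stores an even/odd register pair to a 64-bit aligned address,
 *  which is why the destination is quad-aligned here before entering
 *  the strd-based 128-byte and 32-byte loops below.)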
*/ ldrne ip, [r1], #0x04 stmfd sp!, {r4-r9} /* Free up some registers */ subne r2, r2, #0x04 strne ip, [r3], #0x04 /* Destination buffer quad aligned, source is at least word aligned */ subs r2, r2, #0x80 blt .Lmemcpy_w_lessthan128 /* Copy 128 bytes at a time */ .Lmemcpy_w_loop128: ldr r4, [r1], #0x04 /* LD:00-03 */ ldr r5, [r1], #0x04 /* LD:04-07 */ pld [r1, #0x18] /* Prefetch 0x20 */ ldr r6, [r1], #0x04 /* LD:08-0b */ ldr r7, [r1], #0x04 /* LD:0c-0f */ ldr r8, [r1], #0x04 /* LD:10-13 */ ldr r9, [r1], #0x04 /* LD:14-17 */ strd r4, [r3], #0x08 /* ST:00-07 */ ldr r4, [r1], #0x04 /* LD:18-1b */ ldr r5, [r1], #0x04 /* LD:1c-1f */ strd r6, [r3], #0x08 /* ST:08-0f */ ldr r6, [r1], #0x04 /* LD:20-23 */ ldr r7, [r1], #0x04 /* LD:24-27 */ pld [r1, #0x18] /* Prefetch 0x40 */ strd r8, [r3], #0x08 /* ST:10-17 */ ldr r8, [r1], #0x04 /* LD:28-2b */ ldr r9, [r1], #0x04 /* LD:2c-2f */ strd r4, [r3], #0x08 /* ST:18-1f */ ldr r4, [r1], #0x04 /* LD:30-33 */ ldr r5, [r1], #0x04 /* LD:34-37 */ strd r6, [r3], #0x08 /* ST:20-27 */ ldr r6, [r1], #0x04 /* LD:38-3b */ ldr r7, [r1], #0x04 /* LD:3c-3f */ strd r8, [r3], #0x08 /* ST:28-2f */ ldr r8, [r1], #0x04 /* LD:40-43 */ ldr r9, [r1], #0x04 /* LD:44-47 */ pld [r1, #0x18] /* Prefetch 0x60 */ strd r4, [r3], #0x08 /* ST:30-37 */ ldr r4, [r1], #0x04 /* LD:48-4b */ ldr r5, [r1], #0x04 /* LD:4c-4f */ strd r6, [r3], #0x08 /* ST:38-3f */ ldr r6, [r1], #0x04 /* LD:50-53 */ ldr r7, [r1], #0x04 /* LD:54-57 */ strd r8, [r3], #0x08 /* ST:40-47 */ ldr r8, [r1], #0x04 /* LD:58-5b */ ldr r9, [r1], #0x04 /* LD:5c-5f */ strd r4, [r3], #0x08 /* ST:48-4f */ ldr r4, [r1], #0x04 /* LD:60-63 */ ldr r5, [r1], #0x04 /* LD:64-67 */ pld [r1, #0x18] /* Prefetch 0x80 */ strd r6, [r3], #0x08 /* ST:50-57 */ ldr r6, [r1], #0x04 /* LD:68-6b */ ldr r7, [r1], #0x04 /* LD:6c-6f */ strd r8, [r3], #0x08 /* ST:58-5f */ ldr r8, [r1], #0x04 /* LD:70-73 */ ldr r9, [r1], #0x04 /* LD:74-77 */ strd r4, [r3], #0x08 /* ST:60-67 */ ldr r4, [r1], #0x04 /* LD:78-7b */ ldr r5, [r1], #0x04 /* LD:7c-7f */ strd r6, [r3], #0x08 /* ST:68-6f */ strd r8, [r3], #0x08 /* ST:70-77 */ subs r2, r2, #0x80 strd r4, [r3], #0x08 /* ST:78-7f */ bge .Lmemcpy_w_loop128 .Lmemcpy_w_lessthan128: adds r2, r2, #0x80 /* Adjust for extra sub */ ldmeqfd sp!, {r4-r9} RETeq /* Return now if done */ subs r2, r2, #0x20 blt .Lmemcpy_w_lessthan32 /* Copy 32 bytes at a time */ .Lmemcpy_w_loop32: ldr r4, [r1], #0x04 ldr r5, [r1], #0x04 pld [r1, #0x18] ldr r6, [r1], #0x04 ldr r7, [r1], #0x04 ldr r8, [r1], #0x04 ldr r9, [r1], #0x04 strd r4, [r3], #0x08 ldr r4, [r1], #0x04 ldr r5, [r1], #0x04 strd r6, [r3], #0x08 strd r8, [r3], #0x08 subs r2, r2, #0x20 strd r4, [r3], #0x08 bge .Lmemcpy_w_loop32 .Lmemcpy_w_lessthan32: adds r2, r2, #0x20 /* Adjust for extra sub */ ldmeqfd sp!, {r4-r9} RETeq /* Return now if done */ and r4, r2, #0x18 rsbs r4, r4, #0x18 addne pc, pc, r4, lsl #1 nop /* At least 24 bytes remaining */ ldr r4, [r1], #0x04 ldr r5, [r1], #0x04 sub r2, r2, #0x08 strd r4, [r3], #0x08 /* At least 16 bytes remaining */ ldr r4, [r1], #0x04 ldr r5, [r1], #0x04 sub r2, r2, #0x08 strd r4, [r3], #0x08 /* At least 8 bytes remaining */ ldr r4, [r1], #0x04 ldr r5, [r1], #0x04 subs r2, r2, #0x08 strd r4, [r3], #0x08 /* Less than 8 bytes remaining */ ldmfd sp!, {r4-r9} RETeq /* Return now if done */ subs r2, r2, #0x04 ldrge ip, [r1], #0x04 strge ip, [r3], #0x04 RETeq /* Return now if done */ addlt r2, r2, #0x04 ldrb ip, [r1], #0x01 cmp r2, #0x02 ldrgeb r2, [r1], #0x01 strb ip, [r3], #0x01 ldrgtb ip, [r1] strgeb r2, [r3], #0x01 strgtb ip, [r3] RET /* * At 
this point, it has not been possible to word align both buffers. * The destination buffer is word aligned, but the source buffer is not. */ .Lmemcpy_bad_align: stmfd sp!, {r4-r7} bic r1, r1, #0x03 cmp ip, #2 ldr ip, [r1], #0x04 bgt .Lmemcpy_bad3 beq .Lmemcpy_bad2 b .Lmemcpy_bad1 .Lmemcpy_bad1_loop16: #ifdef __ARMEB__ mov r4, ip, lsl #8 #else mov r4, ip, lsr #8 #endif ldr r5, [r1], #0x04 pld [r1, #0x018] ldr r6, [r1], #0x04 ldr r7, [r1], #0x04 ldr ip, [r1], #0x04 #ifdef __ARMEB__ orr r4, r4, r5, lsr #24 mov r5, r5, lsl #8 orr r5, r5, r6, lsr #24 mov r6, r6, lsl #8 orr r6, r6, r7, lsr #24 mov r7, r7, lsl #8 orr r7, r7, ip, lsr #24 #else orr r4, r4, r5, lsl #24 mov r5, r5, lsr #8 orr r5, r5, r6, lsl #24 mov r6, r6, lsr #8 orr r6, r6, r7, lsl #24 mov r7, r7, lsr #8 orr r7, r7, ip, lsl #24 #endif str r4, [r3], #0x04 str r5, [r3], #0x04 str r6, [r3], #0x04 str r7, [r3], #0x04 .Lmemcpy_bad1: subs r2, r2, #0x10 bge .Lmemcpy_bad1_loop16 adds r2, r2, #0x10 ldmeqfd sp!, {r4-r7} RETeq /* Return now if done */ subs r2, r2, #0x04 sublt r1, r1, #0x03 blt .Lmemcpy_bad_done .Lmemcpy_bad1_loop4: #ifdef __ARMEB__ mov r4, ip, lsl #8 #else mov r4, ip, lsr #8 #endif ldr ip, [r1], #0x04 subs r2, r2, #0x04 #ifdef __ARMEB__ orr r4, r4, ip, lsr #24 #else orr r4, r4, ip, lsl #24 #endif str r4, [r3], #0x04 bge .Lmemcpy_bad1_loop4 sub r1, r1, #0x03 b .Lmemcpy_bad_done .Lmemcpy_bad2_loop16: #ifdef __ARMEB__ mov r4, ip, lsl #16 #else mov r4, ip, lsr #16 #endif ldr r5, [r1], #0x04 pld [r1, #0x018] ldr r6, [r1], #0x04 ldr r7, [r1], #0x04 ldr ip, [r1], #0x04 #ifdef __ARMEB__ orr r4, r4, r5, lsr #16 mov r5, r5, lsl #16 orr r5, r5, r6, lsr #16 mov r6, r6, lsl #16 orr r6, r6, r7, lsr #16 mov r7, r7, lsl #16 orr r7, r7, ip, lsr #16 #else orr r4, r4, r5, lsl #16 mov r5, r5, lsr #16 orr r5, r5, r6, lsl #16 mov r6, r6, lsr #16 orr r6, r6, r7, lsl #16 mov r7, r7, lsr #16 orr r7, r7, ip, lsl #16 #endif str r4, [r3], #0x04 str r5, [r3], #0x04 str r6, [r3], #0x04 str r7, [r3], #0x04 .Lmemcpy_bad2: subs r2, r2, #0x10 bge .Lmemcpy_bad2_loop16 adds r2, r2, #0x10 ldmeqfd sp!, {r4-r7} RETeq /* Return now if done */ subs r2, r2, #0x04 sublt r1, r1, #0x02 blt .Lmemcpy_bad_done .Lmemcpy_bad2_loop4: #ifdef __ARMEB__ mov r4, ip, lsl #16 #else mov r4, ip, lsr #16 #endif ldr ip, [r1], #0x04 subs r2, r2, #0x04 #ifdef __ARMEB__ orr r4, r4, ip, lsr #16 #else orr r4, r4, ip, lsl #16 #endif str r4, [r3], #0x04 bge .Lmemcpy_bad2_loop4 sub r1, r1, #0x02 b .Lmemcpy_bad_done .Lmemcpy_bad3_loop16: #ifdef __ARMEB__ mov r4, ip, lsl #24 #else mov r4, ip, lsr #24 #endif ldr r5, [r1], #0x04 pld [r1, #0x018] ldr r6, [r1], #0x04 ldr r7, [r1], #0x04 ldr ip, [r1], #0x04 #ifdef __ARMEB__ orr r4, r4, r5, lsr #8 mov r5, r5, lsl #24 orr r5, r5, r6, lsr #8 mov r6, r6, lsl #24 orr r6, r6, r7, lsr #8 mov r7, r7, lsl #24 orr r7, r7, ip, lsr #8 #else orr r4, r4, r5, lsl #8 mov r5, r5, lsr #24 orr r5, r5, r6, lsl #8 mov r6, r6, lsr #24 orr r6, r6, r7, lsl #8 mov r7, r7, lsr #24 orr r7, r7, ip, lsl #8 #endif str r4, [r3], #0x04 str r5, [r3], #0x04 str r6, [r3], #0x04 str r7, [r3], #0x04 .Lmemcpy_bad3: subs r2, r2, #0x10 bge .Lmemcpy_bad3_loop16 adds r2, r2, #0x10 ldmeqfd sp!, {r4-r7} RETeq /* Return now if done */ subs r2, r2, #0x04 sublt r1, r1, #0x01 blt .Lmemcpy_bad_done .Lmemcpy_bad3_loop4: #ifdef __ARMEB__ mov r4, ip, lsl #24 #else mov r4, ip, lsr #24 #endif ldr ip, [r1], #0x04 subs r2, r2, #0x04 #ifdef __ARMEB__ orr r4, r4, ip, lsr #8 #else orr r4, r4, ip, lsl #8 #endif str r4, [r3], #0x04 bge .Lmemcpy_bad3_loop4 sub r1, r1, #0x01 .Lmemcpy_bad_done: ldmfd sp!, {r4-r7} 
adds r2, r2, #0x04 RETeq ldrb ip, [r1], #0x01 cmp r2, #0x02 ldrgeb r2, [r1], #0x01 strb ip, [r3], #0x01 ldrgtb ip, [r1] strgeb r2, [r3], #0x01 strgtb ip, [r3] RET /* * Handle short copies (less than 16 bytes), possibly misaligned. * Some of these are *very* common, thanks to the network stack, * and so are handled specially. */ .Lmemcpy_short: add pc, pc, r2, lsl #2 nop RET /* 0x00 */ b .Lmemcpy_bytewise /* 0x01 */ b .Lmemcpy_bytewise /* 0x02 */ b .Lmemcpy_bytewise /* 0x03 */ b .Lmemcpy_4 /* 0x04 */ b .Lmemcpy_bytewise /* 0x05 */ b .Lmemcpy_6 /* 0x06 */ b .Lmemcpy_bytewise /* 0x07 */ b .Lmemcpy_8 /* 0x08 */ b .Lmemcpy_bytewise /* 0x09 */ b .Lmemcpy_bytewise /* 0x0a */ b .Lmemcpy_bytewise /* 0x0b */ b .Lmemcpy_c /* 0x0c */ .Lmemcpy_bytewise: mov r3, r0 /* We must not clobber r0 */ ldrb ip, [r1], #0x01 1: subs r2, r2, #0x01 strb ip, [r3], #0x01 ldrneb ip, [r1], #0x01 bne 1b RET /****************************************************************************** * Special case for 4 byte copies */ #define LMEMCPY_4_LOG2 6 /* 64 bytes */ #define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2 LMEMCPY_4_PAD .Lmemcpy_4: and r2, r1, #0x03 orr r2, r2, r0, lsl #2 ands r2, r2, #0x0f sub r3, pc, #0x14 addne pc, r3, r2, lsl #LMEMCPY_4_LOG2 /* * 0000: dst is 32-bit aligned, src is 32-bit aligned */ ldr r2, [r1] str r2, [r0] RET LMEMCPY_4_PAD /* * 0001: dst is 32-bit aligned, src is 8-bit aligned */ ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */ ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */ #ifdef __ARMEB__ mov r3, r3, lsl #8 /* r3 = 012. */ orr r3, r3, r2, lsr #24 /* r3 = 0123 */ #else mov r3, r3, lsr #8 /* r3 = .210 */ orr r3, r3, r2, lsl #24 /* r3 = 3210 */ #endif str r3, [r0] RET LMEMCPY_4_PAD /* * 0010: dst is 32-bit aligned, src is 16-bit aligned */ #ifdef __ARMEB__ ldrh r3, [r1] ldrh r2, [r1, #0x02] #else ldrh r3, [r1, #0x02] ldrh r2, [r1] #endif orr r3, r2, r3, lsl #16 str r3, [r0] RET LMEMCPY_4_PAD /* * 0011: dst is 32-bit aligned, src is 8-bit aligned */ ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */ ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */ #ifdef __ARMEB__ mov r3, r3, lsl #24 /* r3 = 0... */ orr r3, r3, r2, lsr #8 /* r3 = 0123 */ #else mov r3, r3, lsr #24 /* r3 = ...0 */ orr r3, r3, r2, lsl #8 /* r3 = 3210 */ #endif str r3, [r0] RET LMEMCPY_4_PAD /* * 0100: dst is 8-bit aligned, src is 32-bit aligned */ ldr r2, [r1] #ifdef __ARMEB__ strb r2, [r0, #0x03] mov r3, r2, lsr #8 mov r1, r2, lsr #24 strb r1, [r0] #else strb r2, [r0] mov r3, r2, lsr #8 mov r1, r2, lsr #24 strb r1, [r0, #0x03] #endif strh r3, [r0, #0x01] RET LMEMCPY_4_PAD /* * 0101: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldrb r1, [r1, #0x03] strb r2, [r0] strh r3, [r0, #0x01] strb r1, [r0, #0x03] RET LMEMCPY_4_PAD /* * 0110: dst is 8-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */ #ifdef __ARMEB__ mov r1, r2, lsr #8 /* r1 = ...0 */ strb r1, [r0] mov r2, r2, lsl #8 /* r2 = .01.
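(A note on the dispatch shared by .Lmemcpy_4, _6, _8 and _c: the low two
bits of each pointer are packed into a 4-bit index, (dst & 3) << 2 |
(src & 3), which is scaled by the stanza size (1 << LMEMCPY_N_LOG2) and
added to the PC, so every alignment combination runs its own padded,
branch-free stanza.  A loose C equivalent of the 4-byte case -- the
function name is made up, and the real selection is the computed
"addne pc" above rather than a switch:

	#include <stdint.h>

	static void memcpy_4(uint8_t *dst, const uint8_t *src)
	{
		switch ((((uintptr_t)dst & 3) << 2) | ((uintptr_t)src & 3)) {
		case 0x0:	// 0000: both pointers word aligned
			*(uint32_t *)(void *)dst =
			    *(const uint32_t *)(const void *)src;
			break;
		default:	// the 15 misaligned flavours use byte,
			for (int i = 0; i < 4; i++)	// halfword and
				dst[i] = src[i];	// shifted loads
			break;
		}
	}

)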
*/ orr r2, r2, r3, lsr #8 /* r2 = .012 */ #else strb r2, [r0] mov r2, r2, lsr #8 /* r2 = ...1 */ orr r2, r2, r3, lsl #8 /* r2 = .321 */ mov r3, r3, lsr #8 /* r3 = ...3 */ #endif strh r2, [r0, #0x01] strb r3, [r0, #0x03] RET LMEMCPY_4_PAD /* * 0111: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldrb r1, [r1, #0x03] strb r2, [r0] strh r3, [r0, #0x01] strb r1, [r0, #0x03] RET LMEMCPY_4_PAD /* * 1000: dst is 16-bit aligned, src is 32-bit aligned */ ldr r2, [r1] #ifdef __ARMEB__ strh r2, [r0, #0x02] mov r3, r2, lsr #16 strh r3, [r0] #else strh r2, [r0] mov r3, r2, lsr #16 strh r3, [r0, #0x02] #endif RET LMEMCPY_4_PAD /* * 1001: dst is 16-bit aligned, src is 8-bit aligned */ ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */ ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */ mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */ strh r1, [r0] #ifdef __ARMEB__ mov r2, r2, lsl #8 /* r2 = 012. */ orr r2, r2, r3, lsr #24 /* r2 = 0123 */ #else mov r2, r2, lsr #24 /* r2 = ...2 */ orr r2, r2, r3, lsl #8 /* r2 = xx32 */ #endif strh r2, [r0, #0x02] RET LMEMCPY_4_PAD /* * 1010: dst is 16-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] ldrh r3, [r1, #0x02] strh r2, [r0] strh r3, [r0, #0x02] RET LMEMCPY_4_PAD /* * 1011: dst is 16-bit aligned, src is 8-bit aligned */ ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */ ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */ mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */ strh r1, [r0, #0x02] #ifdef __ARMEB__ mov r3, r3, lsr #24 /* r3 = ...1 */ orr r3, r3, r2, lsl #8 /* r3 = xx01 */ #else mov r3, r3, lsl #8 /* r3 = 321. */ orr r3, r3, r2, lsr #24 /* r3 = 3210 */ #endif strh r3, [r0] RET LMEMCPY_4_PAD /* * 1100: dst is 8-bit aligned, src is 32-bit aligned */ ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */ #ifdef __ARMEB__ strb r2, [r0, #0x03] mov r3, r2, lsr #8 mov r1, r2, lsr #24 strh r3, [r0, #0x01] strb r1, [r0] #else strb r2, [r0] mov r3, r2, lsr #8 mov r1, r2, lsr #24 strh r3, [r0, #0x01] strb r1, [r0, #0x03] #endif RET LMEMCPY_4_PAD /* * 1101: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldrb r1, [r1, #0x03] strb r2, [r0] strh r3, [r0, #0x01] strb r1, [r0, #0x03] RET LMEMCPY_4_PAD /* * 1110: dst is 8-bit aligned, src is 16-bit aligned */ #ifdef __ARMEB__ ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ strb r3, [r0, #0x03] mov r3, r3, lsr #8 /* r3 = ...2 */ orr r3, r3, r2, lsl #8 /* r3 = ..12 */ strh r3, [r0, #0x01] mov r2, r2, lsr #8 /* r2 = ...0 */ strb r2, [r0] #else ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */ strb r2, [r0] mov r2, r2, lsr #8 /* r2 = ...1 */ orr r2, r2, r3, lsl #8 /* r2 = .321 */ strh r2, [r0, #0x01] mov r3, r3, lsr #8 /* r3 = ...3 */ strb r3, [r0, #0x03] #endif RET LMEMCPY_4_PAD /* * 1111: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldrb r1, [r1, #0x03] strb r2, [r0] strh r3, [r0, #0x01] strb r1, [r0, #0x03] RET LMEMCPY_4_PAD /****************************************************************************** * Special case for 6 byte copies */ #define LMEMCPY_6_LOG2 6 /* 64 bytes */ #define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2 LMEMCPY_6_PAD .Lmemcpy_6: and r2, r1, #0x03 orr r2, r2, r0, lsl #2 ands r2, r2, #0x0f sub r3, pc, #0x14 addne pc, r3, r2, lsl #LMEMCPY_6_LOG2 /* * 0000: dst is 32-bit aligned, src is 32-bit aligned */ ldr r2, [r1] ldrh r3, [r1, #0x04] str r2, [r0] strh r3, [r0, #0x04] RET LMEMCPY_6_PAD /* * 0001: dst is 32-bit aligned, 
src is 8-bit aligned */ ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */ ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */ #ifdef __ARMEB__ mov r2, r2, lsl #8 /* r2 = 012. */ orr r2, r2, r3, lsr #24 /* r2 = 0123 */ #else mov r2, r2, lsr #8 /* r2 = .210 */ orr r2, r2, r3, lsl #24 /* r2 = 3210 */ #endif mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */ str r2, [r0] strh r3, [r0, #0x04] RET LMEMCPY_6_PAD /* * 0010: dst is 32-bit aligned, src is 16-bit aligned */ ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ #ifdef __ARMEB__ mov r1, r3, lsr #16 /* r1 = ..23 */ orr r1, r1, r2, lsl #16 /* r1 = 0123 */ str r1, [r0] strh r3, [r0, #0x04] #else mov r1, r3, lsr #16 /* r1 = ..54 */ orr r2, r2, r3, lsl #16 /* r2 = 3210 */ str r2, [r0] strh r1, [r0, #0x04] #endif RET LMEMCPY_6_PAD /* * 0011: dst is 32-bit aligned, src is 8-bit aligned */ ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */ ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */ ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r1 = xxx5 */ #ifdef __ARMEB__ mov r2, r2, lsl #24 /* r2 = 0... */ orr r2, r2, r3, lsr #8 /* r2 = 0123 */ mov r3, r3, lsl #8 /* r3 = 234. */ orr r1, r3, r1, lsr #24 /* r1 = 2345 */ #else mov r2, r2, lsr #24 /* r2 = ...0 */ orr r2, r2, r3, lsl #8 /* r2 = 3210 */ mov r1, r1, lsl #8 /* r1 = xx5. */ orr r1, r1, r3, lsr #24 /* r1 = xx54 */ #endif str r2, [r0] strh r1, [r0, #0x04] RET LMEMCPY_6_PAD /* * 0100: dst is 8-bit aligned, src is 32-bit aligned */ ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */ ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */ mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */ strh r1, [r0, #0x01] #ifdef __ARMEB__ mov r1, r3, lsr #24 /* r1 = ...0 */ strb r1, [r0] mov r3, r3, lsl #8 /* r3 = 123. */ orr r3, r3, r2, lsr #8 /* r3 = 1234 */ #else strb r3, [r0] mov r3, r3, lsr #24 /* r3 = ...3 */ orr r3, r3, r2, lsl #8 /* r3 = .543 */ mov r2, r2, lsr #8 /* r2 = ...5 */ #endif strh r3, [r0, #0x03] strb r2, [r0, #0x05] RET LMEMCPY_6_PAD /* * 0101: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldrh ip, [r1, #0x03] ldrb r1, [r1, #0x05] strb r2, [r0] strh r3, [r0, #0x01] strh ip, [r0, #0x03] strb r1, [r0, #0x05] RET LMEMCPY_6_PAD /* * 0110: dst is 8-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */ #ifdef __ARMEB__ mov r3, r2, lsr #8 /* r3 = ...0 */ strb r3, [r0] strb r1, [r0, #0x05] mov r3, r1, lsr #8 /* r3 = .234 */ strh r3, [r0, #0x03] mov r3, r2, lsl #8 /* r3 = .01. */ orr r3, r3, r1, lsr #24 /* r3 = .012 */ strh r3, [r0, #0x01] #else strb r2, [r0] mov r3, r1, lsr #24 strb r3, [r0, #0x05] mov r3, r1, lsr #8 /* r3 = .543 */ strh r3, [r0, #0x03] mov r3, r2, lsr #8 /* r3 = ...1 */ orr r3, r3, r1, lsl #8 /* r3 = 4321 */ strh r3, [r0, #0x01] #endif RET LMEMCPY_6_PAD /* * 0111: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldrh ip, [r1, #0x03] ldrb r1, [r1, #0x05] strb r2, [r0] strh r3, [r0, #0x01] strh ip, [r0, #0x03] strb r1, [r0, #0x05] RET LMEMCPY_6_PAD /* * 1000: dst is 16-bit aligned, src is 32-bit aligned */ #ifdef __ARMEB__ ldr r2, [r1] /* r2 = 0123 */ ldrh r3, [r1, #0x04] /* r3 = ..45 */ mov r1, r2, lsr #16 /* r1 = ..01 */ orr r3, r3, r2, lsl #16 /* r3 = 2345 */ strh r1, [r0] str r3, [r0, #0x02] #else ldrh r2, [r1, #0x04] /* r2 = ..54 */ ldr r3, [r1] /* r3 = 3210 */ mov r2, r2, lsl #16 /* r2 = 54..
*/ orr r2, r2, r3, lsr #16 /* r2 = 5432 */ strh r3, [r0] str r2, [r0, #0x02] #endif RET LMEMCPY_6_PAD /* * 1001: dst is 16-bit aligned, src is 8-bit aligned */ ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */ ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */ mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */ #ifdef __ARMEB__ mov r2, r2, lsr #8 /* r2 = .345 */ orr r2, r2, r3, lsl #24 /* r2 = 2345 */ #else mov r2, r2, lsl #8 /* r2 = 543. */ orr r2, r2, r3, lsr #24 /* r2 = 5432 */ #endif strh r1, [r0] str r2, [r0, #0x02] RET LMEMCPY_6_PAD /* * 1010: dst is 16-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] ldr r3, [r1, #0x02] strh r2, [r0] str r3, [r0, #0x02] RET LMEMCPY_6_PAD /* * 1011: dst is 16-bit aligned, src is 8-bit aligned */ ldrb r3, [r1] /* r3 = ...0 */ ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */ ldrb r1, [r1, #0x05] /* r1 = ...5 */ #ifdef __ARMEB__ mov r3, r3, lsl #8 /* r3 = ..0. */ orr r3, r3, r2, lsr #24 /* r3 = ..01 */ orr r1, r1, r2, lsl #8 /* r1 = 2345 */ #else orr r3, r3, r2, lsl #8 /* r3 = 3210 */ mov r1, r1, lsl #24 /* r1 = 5... */ orr r1, r1, r2, lsr #8 /* r1 = 5432 */ #endif strh r3, [r0] str r1, [r0, #0x02] RET LMEMCPY_6_PAD /* * 1100: dst is 8-bit aligned, src is 32-bit aligned */ ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */ ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */ #ifdef __ARMEB__ mov r3, r2, lsr #24 /* r3 = ...0 */ strb r3, [r0] mov r2, r2, lsl #8 /* r2 = 123. */ orr r2, r2, r1, lsr #8 /* r2 = 1234 */ #else strb r2, [r0] mov r2, r2, lsr #8 /* r2 = .321 */ orr r2, r2, r1, lsl #24 /* r2 = 4321 */ mov r1, r1, lsr #8 /* r1 = ...5 */ #endif str r2, [r0, #0x01] strb r1, [r0, #0x05] RET LMEMCPY_6_PAD /* * 1101: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldrh ip, [r1, #0x03] ldrb r1, [r1, #0x05] strb r2, [r0] strh r3, [r0, #0x01] strh ip, [r0, #0x03] strb r1, [r0, #0x05] RET LMEMCPY_6_PAD /* * 1110: dst is 8-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */ #ifdef __ARMEB__ mov r3, r2, lsr #8 /* r3 = ...0 */ strb r3, [r0] mov r2, r2, lsl #24 /* r2 = 1... */ orr r2, r2, r1, lsr #8 /* r2 = 1234 */ #else strb r2, [r0] mov r2, r2, lsr #8 /* r2 = ...1 */ orr r2, r2, r1, lsl #8 /* r2 = 4321 */ mov r1, r1, lsr #24 /* r1 = ...5 */ #endif str r2, [r0, #0x01] strb r1, [r0, #0x05] RET LMEMCPY_6_PAD /* * 1111: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldr r3, [r1, #0x01] ldrb r1, [r1, #0x05] strb r2, [r0] str r3, [r0, #0x01] strb r1, [r0, #0x05] RET LMEMCPY_6_PAD /****************************************************************************** * Special case for 8 byte copies */ #define LMEMCPY_8_LOG2 6 /* 64 bytes */ #define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2 LMEMCPY_8_PAD .Lmemcpy_8: and r2, r1, #0x03 orr r2, r2, r0, lsl #2 ands r2, r2, #0x0f sub r3, pc, #0x14 addne pc, r3, r2, lsl #LMEMCPY_8_LOG2 /* * 0000: dst is 32-bit aligned, src is 32-bit aligned */ ldr r2, [r1] ldr r3, [r1, #0x04] str r2, [r0] str r3, [r0, #0x04] RET LMEMCPY_8_PAD /* * 0001: dst is 32-bit aligned, src is 8-bit aligned */ ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */ ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */ ldrb r1, [r1, #0x07] /* r1 = ...7 */ #ifdef __ARMEB__ mov r3, r3, lsl #8 /* r3 = 012. */ orr r3, r3, r2, lsr #24 /* r3 = 0123 */ orr r2, r1, r2, lsl #8 /* r2 = 4567 */ #else mov r3, r3, lsr #8 /* r3 = .210 */ orr r3, r3, r2, lsl #24 /* r3 = 3210 */ mov r1, r1, lsl #24 /* r1 = 7... 
*/ orr r2, r1, r2, lsr #8 /* r2 = 7654 */ #endif str r3, [r0] str r2, [r0, #0x04] RET LMEMCPY_8_PAD /* * 0010: dst is 32-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */ ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */ #ifdef __ARMEB__ mov r2, r2, lsl #16 /* r2 = 01.. */ orr r2, r2, r3, lsr #16 /* r2 = 0123 */ orr r3, r1, r3, lsl #16 /* r3 = 4567 */ #else orr r2, r2, r3, lsl #16 /* r2 = 3210 */ mov r3, r3, lsr #16 /* r3 = ..54 */ orr r3, r3, r1, lsl #16 /* r3 = 7654 */ #endif str r2, [r0] str r3, [r0, #0x04] RET LMEMCPY_8_PAD /* * 0011: dst is 32-bit aligned, src is 8-bit aligned */ ldrb r3, [r1] /* r3 = ...0 */ ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */ ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */ #ifdef __ARMEB__ mov r3, r3, lsl #24 /* r3 = 0... */ orr r3, r3, r2, lsr #8 /* r3 = 0123 */ mov r2, r2, lsl #24 /* r2 = 4... */ orr r2, r2, r1, lsr #8 /* r2 = 4567 */ #else orr r3, r3, r2, lsl #8 /* r3 = 3210 */ mov r2, r2, lsr #24 /* r2 = ...4 */ orr r2, r2, r1, lsl #8 /* r2 = 7654 */ #endif str r3, [r0] str r2, [r0, #0x04] RET LMEMCPY_8_PAD /* * 0100: dst is 8-bit aligned, src is 32-bit aligned */ ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */ ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */ #ifdef __ARMEB__ mov r1, r3, lsr #24 /* r1 = ...0 */ strb r1, [r0] mov r1, r3, lsr #8 /* r1 = .012 */ strb r2, [r0, #0x07] mov r3, r3, lsl #24 /* r3 = 3... */ orr r3, r3, r2, lsr #8 /* r3 = 3456 */ #else strb r3, [r0] mov r1, r2, lsr #24 /* r1 = ...7 */ strb r1, [r0, #0x07] mov r1, r3, lsr #8 /* r1 = .321 */ mov r3, r3, lsr #24 /* r3 = ...3 */ orr r3, r3, r2, lsl #8 /* r3 = 6543 */ #endif strh r1, [r0, #0x01] str r3, [r0, #0x03] RET LMEMCPY_8_PAD /* * 0101: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldr ip, [r1, #0x03] ldrb r1, [r1, #0x07] strb r2, [r0] strh r3, [r0, #0x01] str ip, [r0, #0x03] strb r1, [r0, #0x07] RET LMEMCPY_8_PAD /* * 0110: dst is 8-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */ ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */ #ifdef __ARMEB__ mov ip, r2, lsr #8 /* ip = ...0 */ strb ip, [r0] mov ip, r2, lsl #8 /* ip = .01. */ orr ip, ip, r3, lsr #24 /* ip = .012 */ strb r1, [r0, #0x07] mov r3, r3, lsl #8 /* r3 = 345. 
*/ orr r3, r3, r1, lsr #8 /* r3 = 3456 */ #else strb r2, [r0] /* 0 */ mov ip, r1, lsr #8 /* ip = ...7 */ strb ip, [r0, #0x07] /* 7 */ mov ip, r2, lsr #8 /* ip = ...1 */ orr ip, ip, r3, lsl #8 /* ip = 4321 */ mov r3, r3, lsr #8 /* r3 = .543 */ orr r3, r3, r1, lsl #24 /* r3 = 6543 */ #endif strh ip, [r0, #0x01] str r3, [r0, #0x03] RET LMEMCPY_8_PAD /* * 0111: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r3, [r1] /* r3 = ...0 */ ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */ ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */ ldrb r1, [r1, #0x07] /* r1 = ...7 */ strb r3, [r0] mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */ #ifdef __ARMEB__ strh r3, [r0, #0x01] orr r2, r2, ip, lsl #16 /* r2 = 3456 */ #else strh ip, [r0, #0x01] orr r2, r3, r2, lsl #16 /* r2 = 6543 */ #endif str r2, [r0, #0x03] strb r1, [r0, #0x07] RET LMEMCPY_8_PAD /* * 1000: dst is 16-bit aligned, src is 32-bit aligned */ ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */ ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */ mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */ #ifdef __ARMEB__ strh r1, [r0] mov r1, r3, lsr #16 /* r1 = ..45 */ orr r2, r1 ,r2, lsl #16 /* r2 = 2345 */ #else strh r2, [r0] orr r2, r1, r3, lsl #16 /* r2 = 5432 */ mov r3, r3, lsr #16 /* r3 = ..76 */ #endif str r2, [r0, #0x02] strh r3, [r0, #0x06] RET LMEMCPY_8_PAD /* * 1001: dst is 16-bit aligned, src is 8-bit aligned */ ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */ ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */ ldrb ip, [r1, #0x07] /* ip = ...7 */ mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */ strh r1, [r0] #ifdef __ARMEB__ mov r1, r2, lsl #24 /* r1 = 2... */ orr r1, r1, r3, lsr #8 /* r1 = 2345 */ orr r3, ip, r3, lsl #8 /* r3 = 4567 */ #else mov r1, r2, lsr #24 /* r1 = ...2 */ orr r1, r1, r3, lsl #8 /* r1 = 5432 */ mov r3, r3, lsr #24 /* r3 = ...6 */ orr r3, r3, ip, lsl #8 /* r3 = ..76 */ #endif str r1, [r0, #0x02] strh r3, [r0, #0x06] RET LMEMCPY_8_PAD /* * 1010: dst is 16-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] ldr ip, [r1, #0x02] ldrh r3, [r1, #0x06] strh r2, [r0] str ip, [r0, #0x02] strh r3, [r0, #0x06] RET LMEMCPY_8_PAD /* * 1011: dst is 16-bit aligned, src is 8-bit aligned */ ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */ ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */ ldrb ip, [r1] /* ip = ...0 */ mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */ strh r1, [r0, #0x06] #ifdef __ARMEB__ mov r3, r3, lsr #24 /* r3 = ...5 */ orr r3, r3, r2, lsl #8 /* r3 = 2345 */ mov r2, r2, lsr #24 /* r2 = ...1 */ orr r2, r2, ip, lsl #8 /* r2 = ..01 */ #else mov r3, r3, lsl #24 /* r3 = 5... */ orr r3, r3, r2, lsr #8 /* r3 = 5432 */ orr r2, ip, r2, lsl #8 /* r2 = 3210 */ #endif str r3, [r0, #0x02] strh r2, [r0] RET LMEMCPY_8_PAD /* * 1100: dst is 8-bit aligned, src is 32-bit aligned */ ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */ ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */ mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */ strh r1, [r0, #0x05] #ifdef __ARMEB__ strb r3, [r0, #0x07] mov r1, r2, lsr #24 /* r1 = ...0 */ strb r1, [r0] mov r2, r2, lsl #8 /* r2 = 123. 
*/ orr r2, r2, r3, lsr #24 /* r2 = 1234 */ str r2, [r0, #0x01] #else strb r2, [r0] mov r1, r3, lsr #24 /* r1 = ...7 */ strb r1, [r0, #0x07] mov r2, r2, lsr #8 /* r2 = .321 */ orr r2, r2, r3, lsl #24 /* r2 = 4321 */ str r2, [r0, #0x01] #endif RET LMEMCPY_8_PAD /* * 1101: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r3, [r1] /* r3 = ...0 */ ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */ ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */ ldrb r1, [r1, #0x07] /* r1 = ...7 */ strb r3, [r0] mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */ #ifdef __ARMEB__ strh ip, [r0, #0x05] orr r2, r3, r2, lsl #16 /* r2 = 1234 */ #else strh r3, [r0, #0x05] orr r2, r2, ip, lsl #16 /* r2 = 4321 */ #endif str r2, [r0, #0x01] strb r1, [r0, #0x07] RET LMEMCPY_8_PAD /* * 1110: dst is 8-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */ ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */ #ifdef __ARMEB__ mov ip, r2, lsr #8 /* ip = ...0 */ strb ip, [r0] mov ip, r2, lsl #24 /* ip = 1... */ orr ip, ip, r3, lsr #8 /* ip = 1234 */ strb r1, [r0, #0x07] mov r1, r1, lsr #8 /* r1 = ...6 */ orr r1, r1, r3, lsl #8 /* r1 = 3456 */ #else strb r2, [r0] mov ip, r2, lsr #8 /* ip = ...1 */ orr ip, ip, r3, lsl #8 /* ip = 4321 */ mov r2, r1, lsr #8 /* r2 = ...7 */ strb r2, [r0, #0x07] mov r1, r1, lsl #8 /* r1 = .76. */ orr r1, r1, r3, lsr #24 /* r1 = .765 */ #endif str ip, [r0, #0x01] strh r1, [r0, #0x05] RET LMEMCPY_8_PAD /* * 1111: dst is 8-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] ldr ip, [r1, #0x01] ldrh r3, [r1, #0x05] ldrb r1, [r1, #0x07] strb r2, [r0] str ip, [r0, #0x01] strh r3, [r0, #0x05] strb r1, [r0, #0x07] RET LMEMCPY_8_PAD /****************************************************************************** * Special case for 12 byte copies */ #define LMEMCPY_C_LOG2 7 /* 128 bytes */ #define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2 LMEMCPY_C_PAD .Lmemcpy_c: and r2, r1, #0x03 orr r2, r2, r0, lsl #2 ands r2, r2, #0x0f sub r3, pc, #0x14 addne pc, r3, r2, lsl #LMEMCPY_C_LOG2 /* * 0000: dst is 32-bit aligned, src is 32-bit aligned */ ldr r2, [r1] ldr r3, [r1, #0x04] ldr r1, [r1, #0x08] str r2, [r0] str r3, [r0, #0x04] str r1, [r0, #0x08] RET LMEMCPY_C_PAD /* * 0001: dst is 32-bit aligned, src is 8-bit aligned */ ldrb r2, [r1, #0xb] /* r2 = ...B */ ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */ ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */ ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */ #ifdef __ARMEB__ orr r2, r2, ip, lsl #8 /* r2 = 89AB */ str r2, [r0, #0x08] mov r2, ip, lsr #24 /* r2 = ...7 */ orr r2, r2, r3, lsl #8 /* r2 = 4567 */ mov r1, r1, lsl #8 /* r1 = 012. */ orr r1, r1, r3, lsr #24 /* r1 = 0123 */ #else mov r2, r2, lsl #24 /* r2 = B... */ orr r2, r2, ip, lsr #8 /* r2 = BA98 */ str r2, [r0, #0x08] mov r2, ip, lsl #24 /* r2 = 7... */ orr r2, r2, r3, lsr #8 /* r2 = 7654 */ mov r1, r1, lsr #8 /* r1 = .210 */ orr r1, r1, r3, lsl #24 /* r1 = 3210 */ #endif str r2, [r0, #0x04] str r1, [r0] RET LMEMCPY_C_PAD /* * 0010: dst is 32-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */ ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */ ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */ #ifdef __ARMEB__ mov r2, r2, lsl #16 /* r2 = 01.. */ orr r2, r2, r3, lsr #16 /* r2 = 0123 */ str r2, [r0] mov r3, r3, lsl #16 /* r3 = 45.. 
orr r3, r3, ip, lsr #16 /* r3 = 4567 */ orr r1, r1, ip, lsl #16 /* r1 = 89AB */ #else orr r2, r2, r3, lsl #16 /* r2 = 3210 */ str r2, [r0] mov r3, r3, lsr #16 /* r3 = ..54 */ orr r3, r3, ip, lsl #16 /* r3 = 7654 */ mov r1, r1, lsl #16 /* r1 = BA.. */ orr r1, r1, ip, lsr #16 /* r1 = BA98 */ #endif str r3, [r0, #0x04] str r1, [r0, #0x08] RET LMEMCPY_C_PAD /* * 0011: dst is 32-bit aligned, src is 8-bit aligned */ ldrb r2, [r1] /* r2 = ...0 */ ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */ ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */ ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */ #ifdef __ARMEB__ mov r2, r2, lsl #24 /* r2 = 0... */ orr r2, r2, r3, lsr #8 /* r2 = 0123 */ str r2, [r0] mov r3, r3, lsl #24 /* r3 = 4... */ orr r3, r3, ip, lsr #8 /* r3 = 4567 */ mov r1, r1, lsr #8 /* r1 = .9AB */ orr r1, r1, ip, lsl #24 /* r1 = 89AB */ #else orr r2, r2, r3, lsl #8 /* r2 = 3210 */ str r2, [r0] mov r3, r3, lsr #24 /* r3 = ...4 */ orr r3, r3, ip, lsl #8 /* r3 = 7654 */ mov r1, r1, lsl #8 /* r1 = BA9. */ orr r1, r1, ip, lsr #24 /* r1 = BA98 */ #endif str r3, [r0, #0x04] str r1, [r0, #0x08] RET LMEMCPY_C_PAD /* * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned */ ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */ ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */ ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */ mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */ strh r1, [r0, #0x01] #ifdef __ARMEB__ mov r1, r2, lsr #24 /* r1 = ...0 */ strb r1, [r0] mov r1, r2, lsl #24 /* r1 = 3... */ orr r2, r1, r3, lsr #8 /* r2 = 3456 */ mov r1, r3, lsl #24 /* r1 = 7... */ orr r1, r1, ip, lsr #8 /* r1 = 789A */ #else strb r2, [r0] mov r1, r2, lsr #24 /* r1 = ...3 */ orr r2, r1, r3, lsl #8 /* r2 = 6543 */ mov r1, r3, lsr #24 /* r1 = ...7 */ orr r1, r1, ip, lsl #8 /* r1 = A987 */ mov ip, ip, lsr #24 /* ip = ...B */ #endif str r2, [r0, #0x03] str r1, [r0, #0x07] strb ip, [r0, #0x0b] RET LMEMCPY_C_PAD /* * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1) */ ldrb r2, [r1] ldrh r3, [r1, #0x01] ldr ip, [r1, #0x03] strb r2, [r0] ldr r2, [r1, #0x07] ldrb r1, [r1, #0x0b] strh r3, [r0, #0x01] str ip, [r0, #0x03] str r2, [r0, #0x07] strb r1, [r0, #0x0b] RET LMEMCPY_C_PAD /* * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned */ ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */ ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */ ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */ ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */ #ifdef __ARMEB__ mov r2, r2, ror #8 /* r2 = 1..0 */ strb r2, [r0] mov r2, r2, lsr #16 /* r2 = ..1. */ orr r2, r2, r3, lsr #24 /* r2 = ..12 */ strh r2, [r0, #0x01] mov r2, r3, lsl #8 /* r2 = 345. */ orr r3, r2, ip, lsr #24 /* r3 = 3456 */ mov r2, ip, lsl #8 /* r2 = 789. */ orr r2, r2, r1, lsr #8 /* r2 = 789A */ #else strb r2, [r0] mov r2, r2, lsr #8 /* r2 = ...1 */ orr r2, r2, r3, lsl #8 /* r2 = 4321 */ strh r2, [r0, #0x01] mov r2, r3, lsr #8 /* r2 = .543 */ orr r3, r2, ip, lsl #24 /* r3 = 6543 */ mov r2, ip, lsr #8 /* r2 = .987 */ orr r2, r2, r1, lsl #24 /* r2 = A987 */ mov r1, r1, lsr #8 /* r1 = ...B */ #endif str r3, [r0, #0x03] str r2, [r0, #0x07] strb r1, [r0, #0x0b] RET LMEMCPY_C_PAD /* * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3) */ ldrb r2, [r1] ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */ ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */ ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */ strb r2, [r0] #ifdef __ARMEB__ mov r2, r3, lsr #16 /* r2 = ..12 */ strh r2, [r0, #0x01] mov r3, r3, lsl #16 /* r3 = 34..
*/ orr r3, r3, ip, lsr #16 /* r3 = 3456 */ mov ip, ip, lsl #16 /* ip = 78.. */ orr ip, ip, r1, lsr #16 /* ip = 789A */ mov r1, r1, lsr #8 /* r1 = .9AB */ #else strh r3, [r0, #0x01] mov r3, r3, lsr #16 /* r3 = ..43 */ orr r3, r3, ip, lsl #16 /* r3 = 6543 */ mov ip, ip, lsr #16 /* ip = ..87 */ orr ip, ip, r1, lsl #16 /* ip = A987 */ mov r1, r1, lsr #16 /* r1 = ..xB */ #endif str r3, [r0, #0x03] str ip, [r0, #0x07] strb r1, [r0, #0x0b] RET LMEMCPY_C_PAD /* * 1000: dst is 16-bit aligned, src is 32-bit aligned */ ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */ ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */ ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */ mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */ #ifdef __ARMEB__ strh r1, [r0] mov r1, ip, lsl #16 /* r1 = 23.. */ orr r1, r1, r3, lsr #16 /* r1 = 2345 */ mov r3, r3, lsl #16 /* r3 = 67.. */ orr r3, r3, r2, lsr #16 /* r3 = 6789 */ #else strh ip, [r0] orr r1, r1, r3, lsl #16 /* r1 = 5432 */ mov r3, r3, lsr #16 /* r3 = ..76 */ orr r3, r3, r2, lsl #16 /* r3 = 9876 */ mov r2, r2, lsr #16 /* r2 = ..BA */ #endif str r1, [r0, #0x02] str r3, [r0, #0x06] strh r2, [r0, #0x0a] RET LMEMCPY_C_PAD /* * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1) */ ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */ ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */ mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */ strh ip, [r0] ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */ ldrb r1, [r1, #0x0b] /* r1 = ...B */ #ifdef __ARMEB__ mov r2, r2, lsl #24 /* r2 = 2... */ orr r2, r2, r3, lsr #8 /* r2 = 2345 */ mov r3, r3, lsl #24 /* r3 = 6... */ orr r3, r3, ip, lsr #8 /* r3 = 6789 */ orr r1, r1, ip, lsl #8 /* r1 = 89AB */ #else mov r2, r2, lsr #24 /* r2 = ...2 */ orr r2, r2, r3, lsl #8 /* r2 = 5432 */ mov r3, r3, lsr #24 /* r3 = ...6 */ orr r3, r3, ip, lsl #8 /* r3 = 9876 */ mov r1, r1, lsl #8 /* r1 = ..B. */ orr r1, r1, ip, lsr #24 /* r1 = ..BA */ #endif str r2, [r0, #0x02] str r3, [r0, #0x06] strh r1, [r0, #0x0a] RET LMEMCPY_C_PAD /* * 1010: dst is 16-bit aligned, src is 16-bit aligned */ ldrh r2, [r1] ldr r3, [r1, #0x02] ldr ip, [r1, #0x06] ldrh r1, [r1, #0x0a] strh r2, [r0] str r3, [r0, #0x02] str ip, [r0, #0x06] strh r1, [r0, #0x0a] RET LMEMCPY_C_PAD /* * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3) */ ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */ ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */ mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */ strh ip, [r0, #0x0a] ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */ ldrb r1, [r1] /* r1 = ...0 */ #ifdef __ARMEB__ mov r2, r2, lsr #24 /* r2 = ...9 */ orr r2, r2, r3, lsl #8 /* r2 = 6789 */ mov r3, r3, lsr #24 /* r3 = ...5 */ orr r3, r3, ip, lsl #8 /* r3 = 2345 */ mov r1, r1, lsl #8 /* r1 = ..0. */ orr r1, r1, ip, lsr #24 /* r1 = ..01 */ #else mov r2, r2, lsl #24 /* r2 = 9... */ orr r2, r2, r3, lsr #8 /* r2 = 9876 */ mov r3, r3, lsl #24 /* r3 = 5... */ orr r3, r3, ip, lsr #8 /* r3 = 5432 */ orr r1, r1, ip, lsl #8 /* r1 = 3210 */ #endif str r2, [r0, #0x06] str r3, [r0, #0x02] strh r1, [r0] RET LMEMCPY_C_PAD /* * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned */ ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */ ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */ ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */ #ifdef __ARMEB__ mov r3, r2, lsr #24 /* r3 = ...0 */ strb r3, [r0] mov r2, r2, lsl #8 /* r2 = 123. */ orr r2, r2, ip, lsr #24 /* r2 = 1234 */ str r2, [r0, #0x01] mov r2, ip, lsl #8 /* r2 = 567. 
*/ orr r2, r2, r1, lsr #24 /* r2 = 5678 */ str r2, [r0, #0x05] mov r2, r1, lsr #8 /* r2 = ..9A */ strh r2, [r0, #0x09] strb r1, [r0, #0x0b] #else strb r2, [r0] mov r3, r2, lsr #8 /* r3 = .321 */ orr r3, r3, ip, lsl #24 /* r3 = 4321 */ str r3, [r0, #0x01] mov r3, ip, lsr #8 /* r3 = .765 */ orr r3, r3, r1, lsl #24 /* r3 = 8765 */ str r3, [r0, #0x05] mov r1, r1, lsr #8 /* r1 = .BA9 */ strh r1, [r0, #0x09] mov r1, r1, lsr #16 /* r1 = ...B */ strb r1, [r0, #0x0b] #endif RET LMEMCPY_C_PAD /* * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1) */ ldrb r2, [r1, #0x0b] /* r2 = ...B */ ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */ ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */ ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */ strb r2, [r0, #0x0b] #ifdef __ARMEB__ strh r3, [r0, #0x09] mov r3, r3, lsr #16 /* r3 = ..78 */ orr r3, r3, ip, lsl #16 /* r3 = 5678 */ mov ip, ip, lsr #16 /* ip = ..34 */ orr ip, ip, r1, lsl #16 /* ip = 1234 */ mov r1, r1, lsr #16 /* r1 = ..x0 */ #else mov r2, r3, lsr #16 /* r2 = ..A9 */ strh r2, [r0, #0x09] mov r3, r3, lsl #16 /* r3 = 87.. */ orr r3, r3, ip, lsr #16 /* r3 = 8765 */ mov ip, ip, lsl #16 /* ip = 43.. */ orr ip, ip, r1, lsr #16 /* ip = 4321 */ mov r1, r1, lsr #8 /* r1 = .210 */ #endif str r3, [r0, #0x05] str ip, [r0, #0x01] strb r1, [r0] RET LMEMCPY_C_PAD /* * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned */ #ifdef __ARMEB__ ldrh r2, [r1, #0x0a] /* r2 = ..AB */ ldr ip, [r1, #0x06] /* ip = 6789 */ ldr r3, [r1, #0x02] /* r3 = 2345 */ ldrh r1, [r1] /* r1 = ..01 */ strb r2, [r0, #0x0b] mov r2, r2, lsr #8 /* r2 = ...A */ orr r2, r2, ip, lsl #8 /* r2 = 789A */ mov ip, ip, lsr #8 /* ip = .678 */ orr ip, ip, r3, lsl #24 /* ip = 5678 */ mov r3, r3, lsr #8 /* r3 = .234 */ orr r3, r3, r1, lsl #24 /* r3 = 1234 */ mov r1, r1, lsr #8 /* r1 = ...0 */ strb r1, [r0] str r3, [r0, #0x01] str ip, [r0, #0x05] strh r2, [r0, #0x09] #else ldrh r2, [r1] /* r2 = ..10 */ ldr r3, [r1, #0x02] /* r3 = 5432 */ ldr ip, [r1, #0x06] /* ip = 9876 */ ldrh r1, [r1, #0x0a] /* r1 = ..BA */ strb r2, [r0] mov r2, r2, lsr #8 /* r2 = ...1 */ orr r2, r2, r3, lsl #8 /* r2 = 4321 */ mov r3, r3, lsr #24 /* r3 = ...5 */ orr r3, r3, ip, lsl #8 /* r3 = 8765 */ mov ip, ip, lsr #24 /* ip = ...9 */ orr ip, ip, r1, lsl #8 /* ip = .BA9 */ mov r1, r1, lsr #8 /* r1 = ...B */ str r2, [r0, #0x01] str r3, [r0, #0x05] strh ip, [r0, #0x09] strb r1, [r0, #0x0b] #endif RET LMEMCPY_C_PAD /* * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3) */ ldrb r2, [r1] ldr r3, [r1, #0x01] ldr ip, [r1, #0x05] strb r2, [r0] ldrh r2, [r1, #0x09] ldrb r1, [r1, #0x0b] str r3, [r0, #0x01] str ip, [r0, #0x05] strh r2, [r0, #0x09] strb r1, [r0, #0x0b] RET -#endif /* __XSCALE__ */ +#endif /* _ARM_ARCH_5E */ #ifdef GPROF ENTRY(user) nop ENTRY(btrap) nop ENTRY(etrap) nop ENTRY(bintr) nop ENTRY(eintr) nop #endif Index: head/sys/arm/arm/swtch.S =================================================================== --- head/sys/arm/arm/swtch.S (revision 172613) +++ head/sys/arm/arm/swtch.S (revision 172614) @@ -1,491 +1,491 @@ /* $NetBSD: cpuswitch.S,v 1.41 2003/11/15 08:44:18 scw Exp $ */ /*- * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * cpuswitch.S * * cpu switching functions * * Created : 15/10/94 * */ #include "assym.s" #include #include #include __FBSDID("$FreeBSD$"); /* * New experimental definitions of IRQdisable and IRQenable * These keep FIQ's enabled since FIQ's are special. 
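 *
 * In effect each macro is a three-instruction read-modify-write of the
 * CPSR IRQ mask bit.  An illustrative C rendering (the helper names are
 * hypothetical, not a kernel API):
 *
 *	uint32_t cpsr = read_cpsr();	// mrs r14, cpsr
 *	cpsr |= I32_bit;		// orr for IRQdisable;
 *	write_cpsr_c(cpsr);		// bic clears the bit for IRQenable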
*/ #define DOMAIN_CLIENT 0x01 #define IRQdisable \ mrs r14, cpsr ; \ orr r14, r14, #(I32_bit) ; \ msr cpsr_c, r14 ; \ #define IRQenable \ mrs r14, cpsr ; \ bic r14, r14, #(I32_bit) ; \ msr cpsr_c, r14 ; \ /* * These are used for switching the translation table/DACR. * Since the vector page can be invalid for a short time, we must * disable both regular IRQs *and* FIQs. * * XXX: This is not necessary if the vector table is relocated. */ #define IRQdisableALL \ mrs r14, cpsr ; \ orr r14, r14, #(I32_bit | F32_bit) ; \ msr cpsr_c, r14 #define IRQenableALL \ mrs r14, cpsr ; \ bic r14, r14, #(I32_bit | F32_bit) ; \ msr cpsr_c, r14 .Lcurpcb: .word _C_LABEL(__pcpu) + PC_CURPCB .Lcpufuncs: .word _C_LABEL(cpufuncs) .Lblock_userspace_access: .word _C_LABEL(block_userspace_access) .Lcpu_do_powersave: .word _C_LABEL(cpu_do_powersave) .Lblocked_lock: .word _C_LABEL(blocked_lock) ENTRY(cpu_throw) mov r5, r1 /* * r5 = newtd */ ldr r7, [r5, #(TD_PCB)] /* r7 = new thread's PCB */ /* Switch to lwp0 context */ ldr r9, .Lcpufuncs mov lr, pc ldr pc, [r9, #CF_IDCACHE_WBINV_ALL] ldr r0, [r7, #(PCB_PL1VEC)] ldr r1, [r7, #(PCB_DACR)] /* * r0 = Pointer to L1 slot for vector_page (or NULL) * r1 = lwp0's DACR * r5 = lwp0 * r6 = exit func * r7 = lwp0's PCB * r9 = cpufuncs */ /* * Ensure the vector table is accessible by fixing up lwp0's L1 */ cmp r0, #0 /* No need to fixup vector table? */ ldrne r3, [r0] /* But if yes, fetch current value */ ldrne r2, [r7, #(PCB_L1VEC)] /* Fetch new vector_page value */ mcr p15, 0, r1, c3, c0, 0 /* Update DACR for lwp0's context */ cmpne r3, r2 /* Stuffing the same value? */ strne r2, [r0] /* Store if not. */ #ifdef PMAP_INCLUDE_PTE_SYNC /* * Need to sync the cache to make sure that last store is * visible to the MMU. */ movne r1, #4 movne lr, pc ldrne pc, [r9, #CF_DCACHE_WB_RANGE] #endif /* PMAP_INCLUDE_PTE_SYNC */ /* * Note: We don't do the same optimisation as cpu_switch() with * respect to avoiding flushing the TLB if we're switching to * the same L1 since this process' VM space may be about to go * away, so we don't want *any* turds left in the TLB. */ /* Switch the memory to the new process */ ldr r0, [r7, #(PCB_PAGEDIR)] mov lr, pc ldr pc, [r9, #CF_CONTEXT_SWITCH] /* Restore all the saved registers */ -#ifndef __XSCALE__ +#ifndef _ARM_ARCH_5E add r1, r7, #PCB_R8 ldmia r1, {r8-r13} #else ldr r8, [r7, #(PCB_R8)] ldr r9, [r7, #(PCB_R9)] ldr r10, [r7, #(PCB_R10)] ldr r11, [r7, #(PCB_R11)] ldr r12, [r7, #(PCB_R12)] ldr r13, [r7, #(PCB_SP)] #endif /* We have a new curthread now so make a note of it */ ldr r6, .Lcurthread str r5, [r6] /* Set the new tp */ ldr r6, [r5, #(TD_MD + MD_TP)] mov r5, #ARM_TP_ADDRESS strt r6, [r5] /* Hook in a new pcb */ ldr r6, .Lcurpcb str r7, [r6] ldmfd sp!, {r4-r7, pc} ENTRY(cpu_switch) stmfd sp!, {r4-r7, lr} mov r6, r2 /* Save the mutex */ .Lswitch_resume: /* rem: r0 = old lwp */ /* rem: interrupts are disabled */ #ifdef MULTIPROCESSOR /* XXX use curcpu() */ ldr r2, .Lcpu_info_store str r2, [r6, #(L_CPU)] #endif /* Process is now on a processor. */ /* We have a new curthread now so make a note of it */ ldr r7, .Lcurthread str r1, [r7] /* Hook in a new pcb */ ldr r7, .Lcurpcb ldr r2, [r1, #TD_PCB] str r2, [r7] /* rem: r1 = new process */ /* rem: interrupts are enabled */ /* Stage two : Save old context */ /* Get the user structure for the old thread. */ ldr r2, [r0, #(TD_PCB)] mov r4, r0 /* Save the old thread.
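The register save that follows is the pattern this change generalizes:
pre-v5E cores store r8-r13 with a single stmia, while ARMv5E-class cores
(previously gated on __XSCALE__) use strd, which writes an adjacent
even/odd register pair as one 64-bit access.  A loose C picture of the
two paths (the pcb layout here is hypothetical; needs <stdint.h> and
<string.h>):

	struct pcb_regs { uint32_t r[6]; };	// r8..r12 plus sp, made up

	static void save_generic(struct pcb_regs *p, const uint32_t r[6])
	{
		for (int i = 0; i < 6; i++)	// one stmia in the asm
			p->r[i] = r[i];
	}

	static void save_v5e(struct pcb_regs *p, const uint32_t r[6])
	{
		// three paired 8-byte stores, like strd r8/r10/r12
		memcpy(&p->r[0], &r[0], 8);
		memcpy(&p->r[2], &r[2], 8);
		memcpy(&p->r[4], &r[4], 8);
	}
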
*/ /* Save all the registers in the old thread's pcb */ -#ifndef __XSCALE__ +#ifndef _ARM_ARCH_5E add r7, r2, #(PCB_R8) stmia r7, {r8-r13} #else strd r8, [r2, #(PCB_R8)] strd r10, [r2, #(PCB_R10)] strd r12, [r2, #(PCB_R12)] #endif /* * NOTE: We can now use r8-r13 until it is time to restore * them for the new process. */ /* Store the old tp */ mov r3, #ARM_TP_ADDRESS ldrt r9, [r3] str r9, [r0, #(TD_MD + MD_TP)] /* Set the new tp */ ldr r9, [r1, #(TD_MD + MD_TP)] strt r9, [r3] /* Get the user structure for the new process in r9 */ ldr r9, [r1, #(TD_PCB)] /* r1 now free! */ mrs r3, cpsr /* * We can do that, since * PSR_SVC32_MODE|PSR_UND32_MODE == MSR_UND32_MODE */ orr r8, r3, #(PSR_UND32_MODE) msr cpsr_c, r8 str sp, [r2, #(PCB_UND_SP)] msr cpsr_c, r3 /* Restore the old mode */ /* rem: r8 = old PCB */ /* rem: r9 = new PCB */ /* rem: interrupts are enabled */ /* What else needs to be saved? Only FPA stuff when that is supported */ /* Third phase : restore saved context */ /* rem: r8 = old PCB */ /* rem: r9 = new PCB */ /* rem: interrupts are enabled */ ldr r5, [r9, #(PCB_DACR)] /* r5 = new DACR */ mov r2, #DOMAIN_CLIENT cmp r5, r2, lsl #(PMAP_DOMAIN_KERNEL * 2) /* Sw to kernel thread? */ beq .Lcs_context_switched /* Yup. Don't flush cache */ mrc p15, 0, r0, c3, c0, 0 /* r0 = old DACR */ /* * Get the new L1 table pointer into r11. If we're switching to * an LWP with the same address space as the outgoing one, we can * skip the cache purge and the TTB load. * * To avoid data dep stalls that would happen anyway, we try * to get some useful work done in the meantime. */ mrc p15, 0, r10, c2, c0, 0 /* r10 = old L1 */ ldr r11, [r9, #(PCB_PAGEDIR)] /* r11 = new L1 */ teq r10, r11 /* Same L1? */ cmpeq r0, r5 /* Same DACR? */ beq .Lcs_context_switched /* yes! */ /* * Definitely need to flush the cache. */ ldr r1, .Lcpufuncs mov lr, pc ldr pc, [r1, #CF_IDCACHE_WBINV_ALL] .Lcs_cache_purge_skipped: /* rem: r6 = lock */ /* rem: r9 = new PCB */ /* rem: r10 = old L1 */ /* rem: r11 = new L1 */ mov r2, #0x00000000 ldr r7, [r9, #(PCB_PL1VEC)] /* * Ensure the vector table is accessible by fixing up the L1 */ cmp r7, #0 /* No need to fixup vector table? */ ldrne r2, [r7] /* But if yes, fetch current value */ ldrne r0, [r9, #(PCB_L1VEC)] /* Fetch new vector_page value */ mcr p15, 0, r5, c3, c0, 0 /* Update DACR for new context */ cmpne r2, r0 /* Stuffing the same value? */ #ifndef PMAP_INCLUDE_PTE_SYNC strne r0, [r7] /* Nope, update it */ #else beq .Lcs_same_vector str r0, [r7] /* Otherwise, update it */ /* * Need to sync the cache to make sure that last store is * visible to the MMU. */ ldr r2, .Lcpufuncs mov r0, r7 mov r1, #4 mov lr, pc ldr pc, [r2, #CF_DCACHE_WB_RANGE] .Lcs_same_vector: #endif /* PMAP_INCLUDE_PTE_SYNC */ cmp r10, r11 /* Switching to the same L1? */ ldr r10, .Lcpufuncs beq .Lcs_same_l1 /* Yup. */ /* * Do a full context switch, including full TLB flush. */ mov r0, r11 mov lr, pc ldr pc, [r10, #CF_CONTEXT_SWITCH] b .Lcs_context_switched /* * We're switching to a different process in the same L1. * In this situation, we only need to flush the TLB for the * vector_page mapping, and even then only if r7 is non-NULL.
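 *
 * In outline (illustrative C; the helpers stand in for the cpufuncs
 * slots CF_TLB_FLUSHID_SE and CF_CONTEXT_SWITCH called below):
 *
 *	if (new_l1 == old_l1) {			// same address space
 *		if (pl1vec != NULL)		// vector slot was patched
 *			tlb_flushID_SE(0);	// flush only VA 0x0
 *	} else {
 *		context_switch(new_l1);		// TTB load + full TLB flush
 *	}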
*/ .Lcs_same_l1: cmp r7, #0 movne r0, #0 /* We *know* vector_page's VA is 0x0 */ movne lr, pc ldrne pc, [r10, #CF_TLB_FLUSHID_SE] /* * We can do that, since * PSR_SVC32_MODE|PSR_UND32_MODE == MSR_UND32_MODE */ .Lcs_context_switched: /* Release the old thread */ str r6, [r4, #TD_LOCK] ldr r6, .Lblocked_lock ldr r3, .Lcurthread ldr r3, [r3] 1: ldr r4, [r3, #TD_LOCK] cmp r4, r6 beq 1b /* XXXSCW: Safe to re-enable FIQs here */ /* rem: r9 = new PCB */ mrs r3, cpsr /* * We can do that, since * PSR_SVC32_MODE|PSR_UND32_MODE == MSR_UND32_MODE */ orr r2, r3, #(PSR_UND32_MODE) msr cpsr_c, r2 ldr sp, [r9, #(PCB_UND_SP)] msr cpsr_c, r3 /* Restore the old mode */ /* Restore all the saved registers */ -#ifndef __XSCALE__ +#ifndef _ARM_ARCH_5E add r7, r9, #PCB_R8 ldmia r7, {r8-r13} sub r7, r7, #PCB_R8 /* restore PCB pointer */ #else mov r7, r9 ldr r8, [r7, #(PCB_R8)] ldr r9, [r7, #(PCB_R9)] ldr r10, [r7, #(PCB_R10)] ldr r11, [r7, #(PCB_R11)] ldr r12, [r7, #(PCB_R12)] ldr r13, [r7, #(PCB_SP)] #endif /* rem: r6 = lock */ /* rem: r7 = new pcb */ #ifdef ARMFPE add r0, r7, #(USER_SIZE) & 0x00ff add r0, r0, #(USER_SIZE) & 0xff00 bl _C_LABEL(arm_fpe_core_changecontext) #endif /* rem: r5 = new lwp's proc */ /* rem: r6 = lock */ /* rem: r7 = new PCB */ .Lswitch_return: /* * Pull the registers that got pushed when either savectx() or * cpu_switch() was called and return. */ ldmfd sp!, {r4-r7, pc} #ifdef DIAGNOSTIC .Lswitch_bogons: adr r0, .Lswitch_panic_str bl _C_LABEL(panic) 1: nop b 1b .Lswitch_panic_str: .asciz "cpu_switch: sched_qs empty with non-zero sched_whichqs!\n" #endif ENTRY(savectx) stmfd sp!, {r4-r7, lr} /* * r0 = pcb */ /* Store all the registers in the process's pcb */ add r2, r0, #(PCB_R8) stmia r2, {r8-r13} ldmfd sp!, {r4-r7, pc} ENTRY(fork_trampoline) mov r1, r5 mov r2, sp mov r0, r4 mov fp, #0 bl _C_LABEL(fork_exit) /* Kill IRQs */ mrs r0, cpsr orr r0, r0, #(I32_bit|F32_bit) msr cpsr_c, r0 DO_AST PULLFRAME movs pc, lr /* Exit */ AST_LOCALS
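/*
 * The TD_LOCK sequence in .Lcs_context_switched above doubles as the
 * scheduler lock handoff: the outgoing thread's td_lock is released by
 * storing the mutex passed in to cpu_switch(), and the incoming thread
 * then spins while its own td_lock still points at blocked_lock, i.e.
 * while some other CPU is still switching it out.  A loose C rendering
 * (illustrative only):
 *
 *	old->td_lock = mtx;			// release the old thread
 *	while (new->td_lock == &blocked_lock)	// still being descheduled?
 *		;				// spin until handed off
 */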