Index: stable/10/sys/arm/arm/cpufunc_asm_arm10.S
===================================================================
--- stable/10/sys/arm/arm/cpufunc_asm_arm10.S	(revision 269795)
+++ stable/10/sys/arm/arm/cpufunc_asm_arm10.S	(revision 269796)
@@ -1,276 +1,276 @@
 /*	$NetBSD: cpufunc_asm_arm10.S,v 1.1 2003/09/06 09:12:29 rearnsha Exp $	*/
 
 /*-
  * Copyright (c) 2002 ARM Limited
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * ARM10 assembly functions for CPU / MMU / TLB specific operations
  *
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Functions to set the MMU Translation Table Base register
  *
  * We need to clean and flush the cache as it uses virtual
  * addresses that are about to change.
  */
 ENTRY(arm10_setttb)
 	stmfd	sp!, {r0, lr}
 	bl	_C_LABEL(arm10_idcache_wbinv_all)
 	ldmfd	sp!, {r0, lr}
 
 	mcr	p15, 0, r0, c2, c0, 0	/* load new TTB */
 
 	mcr	p15, 0, r0, c8, c7, 0	/* invalidate I+D TLBs */
 	bx	lr
 END(arm10_setttb)
 
 /*
  * TLB functions
  */
 ENTRY(arm10_tlb_flushID_SE)
 	mcr	p15, 0, r0, c8, c6, 1	/* flush D tlb single entry */
 	mcr	p15, 0, r0, c8, c5, 1	/* flush I tlb single entry */
 	bx	lr
 END(arm10_tlb_flushID_SE)
 
 ENTRY(arm10_tlb_flushI_SE)
 	mcr	p15, 0, r0, c8, c5, 1	/* flush I tlb single entry */
 	bx	lr
 END(arm10_tlb_flushI_SE)
 
 /*
  * Cache operations.  For the entire cache we use the set/index
  * operations.
  */
 	s_max	.req r0
 	i_max	.req r1
 	s_inc	.req r2
 	i_inc	.req r3
 
 ENTRY_NP(arm10_icache_sync_range)
 	ldr	ip, .Larm10_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm10_icache_sync_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm10_sync_next:
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm10_sync_next
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	bx	lr
 END(arm10_icache_sync_range)
 
 ENTRY_NP(arm10_icache_sync_all)
 .Larm10_icache_sync_all:
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache cleaning code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	/* Fall through to clean Dcache. */
 
 .Larm10_dcache_wb:
 	ldr	ip, .Larm10_cache_data
 	ldmia	ip, {s_max, i_max, s_inc, i_inc}
 .Lnext_set:
 	orr	ip, s_max, i_max
 .Lnext_index:
 	mcr	p15, 0, ip, c7, c10, 2	/* Clean D cache SE with Set/Index */
 	subs	ip, ip, i_inc
 	bhs	.Lnext_index		/* Next index */
 	subs	s_max, s_max, s_inc
 	bhs	.Lnext_set		/* Next set */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	bx	lr
 END(arm10_icache_sync_all)
 
 .Larm10_line_size:
 	.word	_C_LABEL(arm_pdcache_line_size)
 
 ENTRY(arm10_dcache_wb_range)
 	ldr	ip, .Larm10_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm10_dcache_wb
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm10_wb_next:
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm10_wb_next
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	bx	lr
 END(arm10_dcache_wb_range)
 	
 ENTRY(arm10_dcache_wbinv_range)
 	ldr	ip, .Larm10_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm10_dcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm10_wbinv_next:
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm10_wbinv_next
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	bx	lr
 END(arm10_dcache_wbinv_range)
 	
 /*
  * Note, we must not invalidate everything.  If the range is too big we
  * must use wb-inv of the entire cache.
  */
 ENTRY(arm10_dcache_inv_range)
 	ldr	ip, .Larm10_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm10_dcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm10_inv_next:
 	mcr	p15, 0, r0, c7, c6, 1	/* Invalidate D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm10_inv_next
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	bx	lr
 END(arm10_dcache_inv_range)
 
 ENTRY(arm10_idcache_wbinv_range)
 	ldr	ip, .Larm10_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm10_idcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm10_id_wbinv_next:
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm10_id_wbinv_next
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	bx	lr
 END(arm10_idcache_wbinv_range)
 
 ENTRY_NP(arm10_idcache_wbinv_all)
 .Larm10_idcache_wbinv_all:
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache purging code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	/* Fall through to purge Dcache. */
 
-ENTRY(arm10_dcache_wbinv_all)
+EENTRY(arm10_dcache_wbinv_all)	/* entry that skips the I-cache flush above; idcache_wbinv_all falls through into it */
 .Larm10_dcache_wbinv_all:
 	ldr	ip, .Larm10_cache_data
 	ldmia	ip, {s_max, i_max, s_inc, i_inc}
 .Lnext_set_inv:
 	orr	ip, s_max, i_max
 .Lnext_index_inv:
 	mcr	p15, 0, ip, c7, c14, 2	/* Purge D cache SE with Set/Index */
 	subs	ip, ip, i_inc
 	bhs	.Lnext_index_inv		/* Next index */
 	subs	s_max, s_max, s_inc
 	bhs	.Lnext_set_inv		/* Next set */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	bx	lr
+EEND(arm10_dcache_wbinv_all)	/* inner entry is closed first, then the enclosing function */
 END(arm10_idcache_wbinv_all)
-END(arm10_dcache_wbinv_all)
 
 .Larm10_cache_data:
 	.word	_C_LABEL(arm10_dcache_sets_max)
 
 /*
  * Context switch.
  *
  * These is the CPU-specific parts of the context switcher cpu_switch()
  * These functions actually perform the TTB reload.
  *
  * NOTE: Special calling convention
  *	r1, r4-r13 must be preserved
  */
 ENTRY(arm10_context_switch)
 	/*
 	 * We can assume that the caches will only contain kernel addresses
 	 * at this point.  So no need to flush them again.
 	 */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	mcr	p15, 0, r0, c2, c0, 0	/* set the new TTB */
 	mcr	p15, 0, r0, c8, c7, 0	/* and flush the I+D tlbs */
 
 	/* Paranoia -- make sure the pipeline is empty. */
 	nop
 	nop
 	nop
 	bx	lr
 END(arm10_context_switch)
 
 	.bss
 
 /* XXX The following macros should probably be moved to asm.h */
 #define _DATA_OBJECT(x) .globl x; .type x,_ASM_TYPE_OBJECT; x:
 #define C_OBJECT(x)	_DATA_OBJECT(_C_LABEL(x))
 
 /*
  * Parameters for the cache cleaning code.  Note that the order of these
  * four variables is assumed in the code above.  Hence the reason for
  * declaring them in the assembler file.
  */
 	.align 0
 C_OBJECT(arm10_dcache_sets_max)
 	.space	4
 C_OBJECT(arm10_dcache_index_max)
 	.space	4
 C_OBJECT(arm10_dcache_sets_inc)
 	.space	4
 C_OBJECT(arm10_dcache_index_inc)
 	.space	4
Index: stable/10/sys/arm/arm/cpufunc_asm_arm9.S
===================================================================
--- stable/10/sys/arm/arm/cpufunc_asm_arm9.S	(revision 269795)
+++ stable/10/sys/arm/arm/cpufunc_asm_arm9.S	(revision 269796)
@@ -1,263 +1,263 @@
 /*	$NetBSD: cpufunc_asm_arm9.S,v 1.3 2004/01/26 15:54:16 rearnsha Exp $	*/
 
 /*
  * Copyright (c) 2001, 2004 ARM Limited
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * ARM9 assembly functions for CPU / MMU / TLB specific operations
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Functions to set the MMU Translation Table Base register
  *
  * We need to clean and flush the cache as it uses virtual
  * addresses that are about to change.
  */
 ENTRY(arm9_setttb)
 	stmfd	sp!, {r0, lr}
 	bl	_C_LABEL(arm9_idcache_wbinv_all)
 	ldmfd	sp!, {r0, lr}
 
 	mcr	p15, 0, r0, c2, c0, 0	/* load new TTB */
 
 	mcr	p15, 0, r0, c8, c7, 0	/* invalidate I+D TLBs */
 	mov	pc, lr
 END(arm9_setttb)
 
 /*
  * TLB functions
  */
 ENTRY(arm9_tlb_flushID_SE)
 	mcr	p15, 0, r0, c8, c6, 1	/* flush D tlb single entry */
 	mcr	p15, 0, r0, c8, c5, 1	/* flush I tlb single entry */
 	mov	pc, lr
 END(arm9_tlb_flushID_SE)
 
 /*
  * Cache operations.  For the entire cache we use the set/index
  * operations.
  */
 	s_max	.req r0
 	i_max	.req r1
 	s_inc	.req r2
 	i_inc	.req r3
 
 ENTRY_NP(arm9_icache_sync_range)
 	ldr	ip, .Larm9_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm9_icache_sync_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm9_sync_next:
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm9_sync_next
 	mov	pc, lr
 END(arm9_icache_sync_range)
 
 ENTRY_NP(arm9_icache_sync_all)
 .Larm9_icache_sync_all:
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache cleaning code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	/* Fall through to clean Dcache. */
 
 .Larm9_dcache_wb:
 	ldr	ip, .Larm9_cache_data
 	ldmia	ip, {s_max, i_max, s_inc, i_inc}
 .Lnext_set:
 	orr	ip, s_max, i_max
 .Lnext_index:
 	mcr	p15, 0, ip, c7, c10, 2	/* Clean D cache SE with Set/Index */
 	subs	ip, ip, i_inc
 	bhs	.Lnext_index		/* Next index */
 	subs	s_max, s_max, s_inc
 	bhs	.Lnext_set		/* Next set */
 	mov	pc, lr
 END(arm9_icache_sync_all)
 
 .Larm9_line_size:
 	.word	_C_LABEL(arm_pdcache_line_size)
 
 ENTRY(arm9_dcache_wb_range)
 	ldr	ip, .Larm9_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm9_dcache_wb
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm9_wb_next:
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm9_wb_next
 	mov	pc, lr
 END(arm9_dcache_wb_range)
 	
 ENTRY(arm9_dcache_wbinv_range)
 	ldr	ip, .Larm9_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm9_dcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm9_wbinv_next:
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm9_wbinv_next
 	mov	pc, lr
 END(arm9_dcache_wbinv_range)
 	
 /*
  * Note, we must not invalidate everything.  If the range is too big we
  * must use wb-inv of the entire cache.
  */
 ENTRY(arm9_dcache_inv_range)
 	ldr	ip, .Larm9_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm9_dcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm9_inv_next:
 	mcr	p15, 0, r0, c7, c6, 1	/* Invalidate D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm9_inv_next
 	mov	pc, lr
 END(arm9_dcache_inv_range)
 
 ENTRY(arm9_idcache_wbinv_range)
 	ldr	ip, .Larm9_line_size
 	cmp	r1, #0x4000
 	bcs	.Larm9_idcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larm9_id_wbinv_next:
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larm9_id_wbinv_next
 	mov	pc, lr
 END(arm9_idcache_wbinv_range)
 
 ENTRY_NP(arm9_idcache_wbinv_all)
 .Larm9_idcache_wbinv_all:
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache purging code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	/* Fall through */
 
-ENTRY(arm9_dcache_wbinv_all)
+EENTRY(arm9_dcache_wbinv_all)	/* entry that skips the I-cache flush above; idcache_wbinv_all falls through into it */
 .Larm9_dcache_wbinv_all:
 	ldr	ip, .Larm9_cache_data
 	ldmia	ip, {s_max, i_max, s_inc, i_inc}
 .Lnext_set_inv:
 	orr	ip, s_max, i_max
 .Lnext_index_inv:
 	mcr	p15, 0, ip, c7, c14, 2	/* Purge D cache SE with Set/Index */
 	subs	ip, ip, i_inc
 	bhs	.Lnext_index_inv		/* Next index */
 	subs	s_max, s_max, s_inc
 	bhs	.Lnext_set_inv		/* Next set */
 	mov	pc, lr
+EEND(arm9_dcache_wbinv_all)	/* inner entry is closed first, then the enclosing function */
 END(arm9_idcache_wbinv_all)
-END(arm9_dcache_wbinv_all)
 
 .Larm9_cache_data:
 	.word	_C_LABEL(arm9_dcache_sets_max)
 
 /*
  * Context switch.
  *
  * These is the CPU-specific parts of the context switcher cpu_switch()
  * These functions actually perform the TTB reload.
  *
  * NOTE: Special calling convention
  *	r1, r4-r13 must be preserved
  */
 ENTRY(arm9_context_switch)
 	/*
 	 * We can assume that the caches will only contain kernel addresses
 	 * at this point.  So no need to flush them again.
 	 */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	mcr	p15, 0, r0, c2, c0, 0	/* set the new TTB */
 	mcr	p15, 0, r0, c8, c7, 0	/* and flush the I+D tlbs */
 
 	/* Paranoia -- make sure the pipeline is empty. */
 	nop
 	nop
 	nop
 	mov	pc, lr
 END(arm9_context_switch)
 
 	.bss
 
 /* XXX The following macros should probably be moved to asm.h */
 #define _DATA_OBJECT(x) .globl x; .type x,_ASM_TYPE_OBJECT; x:
 #define C_OBJECT(x)	_DATA_OBJECT(_C_LABEL(x))
 
 /*
  * Parameters for the cache cleaning code.  Note that the order of these
  * four variables is assumed in the code above.  Hence the reason for
  * declaring them in the assembler file.
  */
 	.align 0
 C_OBJECT(arm9_dcache_sets_max)
 	.space	4
 C_OBJECT(arm9_dcache_index_max)
 	.space	4
 C_OBJECT(arm9_dcache_sets_inc)
 	.space	4
 C_OBJECT(arm9_dcache_index_inc)
 	.space	4
Index: stable/10/sys/arm/arm/cpufunc_asm_armv5.S
===================================================================
--- stable/10/sys/arm/arm/cpufunc_asm_armv5.S	(revision 269795)
+++ stable/10/sys/arm/arm/cpufunc_asm_armv5.S	(revision 269796)
@@ -1,247 +1,248 @@
 /*	$NetBSD: cpufunc_asm_armv5.S,v 1.3 2007/01/06 00:50:54 christos Exp $	*/
 
 /*
  * Copyright (c) 2002, 2005 ARM Limited
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * ARMv5 assembly functions for manipulating caches.
  * These routines can be used by any core that supports the set/index
  * operations.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Functions to set the MMU Translation Table Base register
  *
  * We need to clean and flush the cache as it uses virtual
  * addresses that are about to change.
  */
 ENTRY(armv5_setttb)
 	stmfd	sp!, {r0, lr}
 	bl	_C_LABEL(armv5_idcache_wbinv_all)
 	ldmfd	sp!, {r0, lr}
 
 	mcr	p15, 0, r0, c2, c0, 0	/* load new TTB */
 
 	mcr	p15, 0, r0, c8, c7, 0	/* invalidate I+D TLBs */
 	RET
 END(armv5_setttb)
 
 /*
  * Cache operations.  For the entire cache we use the set/index
  * operations.
  */
 	s_max	.req r0
 	i_max	.req r1
 	s_inc	.req r2
 	i_inc	.req r3
 
 ENTRY_NP(armv5_icache_sync_range)
 	ldr	ip, .Larmv5_line_size
 	cmp	r1, #0x4000
 	bcs	.Larmv5_icache_sync_all
 	ldr	ip, [ip]
 	sub	r1, r1, #1		/* Don't overrun */
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 1:
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bpl	1b
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv5_icache_sync_range)
 
 ENTRY_NP(armv5_icache_sync_all)
 .Larmv5_icache_sync_all:
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache cleaning code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	/* Fall through to clean Dcache. */
 
 .Larmv5_dcache_wb:
 	ldr	ip, .Larmv5_cache_data
 	ldmia	ip, {s_max, i_max, s_inc, i_inc}
 1:
 	orr	ip, s_max, i_max
 2:
 	mcr	p15, 0, ip, c7, c10, 2	/* Clean D cache SE with Set/Index */
 	sub	ip, ip, i_inc
 	tst	ip, i_max		/* Index 0 is last one */
 	bne	2b			/* Next index */
 	mcr	p15, 0, ip, c7, c10, 2	/* Clean D cache SE with Set/Index */
 	subs	s_max, s_max, s_inc
 	bpl	1b			/* Next set */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv5_icache_sync_all)
 
 .Larmv5_line_size:
 	.word	_C_LABEL(arm_pdcache_line_size)
 
 ENTRY(armv5_dcache_wb_range)
 	ldr	ip, .Larmv5_line_size
 	cmp	r1, #0x4000
 	bcs	.Larmv5_dcache_wb
 	ldr	ip, [ip]
 	sub	r1, r1, #1		/* Don't overrun */
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 1:
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bpl	1b
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv5_dcache_wb_range)
 	
 ENTRY(armv5_dcache_wbinv_range)
 	ldr	ip, .Larmv5_line_size
 	cmp	r1, #0x4000
 	bcs	.Larmv5_dcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r1, r1, #1		/* Don't overrun */
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 1:
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bpl	1b
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv5_dcache_wbinv_range)
 	
 /*
  * Note, we must not invalidate everything.  If the range is too big we
  * must use wb-inv of the entire cache.
  */
 ENTRY(armv5_dcache_inv_range)
 	ldr	ip, .Larmv5_line_size
 	cmp	r1, #0x4000
 	bcs	.Larmv5_dcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r1, r1, #1		/* Don't overrun */
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 1:
 	mcr	p15, 0, r0, c7, c6, 1	/* Invalidate D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bpl	1b
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv5_dcache_inv_range)
 
 ENTRY(armv5_idcache_wbinv_range)
 	ldr	ip, .Larmv5_line_size
 	cmp	r1, #0x4000
 	bcs	.Larmv5_idcache_wbinv_all
 	ldr	ip, [ip]
 	sub	r1, r1, #1		/* Don't overrun */
 	sub	r3, ip, #1
 	and	r2, r0, r3
 	add	r1, r1, r2
 	bic	r0, r0, r3
 1:
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bpl	1b
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv5_idcache_wbinv_range)
 
 ENTRY_NP(armv5_idcache_wbinv_all)
+armv5_idcache_wbinv_all:	/* NOTE(review): ENTRY_NP above presumably already defines this label -- confirm it is needed */
 .Larmv5_idcache_wbinv_all:
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache purging code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	/* Fall through to purge Dcache. */
 
-ENTRY(armv5_dcache_wbinv_all)
+EENTRY(armv5_dcache_wbinv_all)	/* entry that skips the I-cache flush above; idcache_wbinv_all falls through into it */
 .Larmv5_dcache_wbinv_all:
 	ldr	ip, .Larmv5_cache_data
 	ldmia	ip, {s_max, i_max, s_inc, i_inc}
 1:
 	orr	ip, s_max, i_max
 2:
 	mcr	p15, 0, ip, c7, c14, 2	/* Purge D cache SE with Set/Index */
 	sub	ip, ip, i_inc
 	tst	ip, i_max		/* Index 0 is last one */
 	bne	2b			/* Next index */
 	mcr	p15, 0, ip, c7, c14, 2	/* Purge D cache SE with Set/Index */
 	subs	s_max, s_max, s_inc
 	bpl	1b			/* Next set */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
+EEND(armv5_dcache_wbinv_all)	/* inner entry is closed first, then the enclosing function */
 END(armv5_idcache_wbinv_all)
-END(armv5_dcache_wbinv_all)
 
 .Larmv5_cache_data:
 	.word	_C_LABEL(armv5_dcache_sets_max)
 
 	.bss
 
 /* XXX The following macros should probably be moved to asm.h */
 #define _DATA_OBJECT(x) .globl x; .type x,_ASM_TYPE_OBJECT; x:
 #define C_OBJECT(x)	_DATA_OBJECT(_C_LABEL(x))
 
 /*
  * Parameters for the cache cleaning code.  Note that the order of these
  * four variables is assumed in the code above.  Hence the reason for
  * declaring them in the assembler file.
  */
 	.align 0
 C_OBJECT(armv5_dcache_sets_max)
 	.space	4
 C_OBJECT(armv5_dcache_index_max)
 	.space	4
 C_OBJECT(armv5_dcache_sets_inc)
 	.space	4
 C_OBJECT(armv5_dcache_index_inc)
 	.space	4
Index: stable/10/sys/arm/arm/cpufunc_asm_armv6.S
===================================================================
--- stable/10/sys/arm/arm/cpufunc_asm_armv6.S	(revision 269795)
+++ stable/10/sys/arm/arm/cpufunc_asm_armv6.S	(revision 269796)
@@ -1,152 +1,152 @@
 /*	$NetBSD: cpufunc_asm_armv6.S,v 1.4 2010/12/10 02:06:22 bsh Exp $	*/
 
 /*
  * Copyright (c) 2002, 2005 ARM Limited
  * Portions Copyright (c) 2007 Microsoft
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * ARMv6 assembly functions for manipulating caches.
  * These routines can be used by any core that supports the mcrr address
  * range operations.
  */
 
 /*
  * $FreeBSD$
  */
  
 #include <machine/asm.h>
 
 	.arch	armv6
 
 /*
  * Functions to set the MMU Translation Table Base register
  *
  * We need to clean and flush the cache as it uses virtual
  * addresses that are about to change.
  */
 ENTRY(armv6_setttb)
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 
 	mcr	p15, 0, r0, c2, c0, 0	/* load new TTB */
 
 	mcr	p15, 0, r0, c8, c7, 0	/* invalidate I+D TLBs */
 	RET
 END(armv6_setttb)
 
 /*
  * Cache operations.
  */
 
 /* LINTSTUB: void armv6_icache_sync_range(vaddr_t, vsize_t); */
 ENTRY_NP(armv6_icache_sync_range)
 	add	r1, r1, r0
 	sub	r1, r1, #1
 	mcrr	p15, 0, r1, r0, c5	/* invalidate I cache range */
 	mcrr	p15, 0, r1, r0, c12	/* clean D cache range */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv6_icache_sync_range)
 
 /* LINTSTUB: void armv6_icache_sync_all(void); */
 ENTRY_NP(armv6_icache_sync_all)
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache cleaning code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	mcr	p15, 0, r0, c7, c10, 0	/* Clean D cache */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv6_icache_sync_all)
 
 /* LINTSTUB: void armv6_dcache_wb_range(vaddr_t, vsize_t); */
 ENTRY(armv6_dcache_wb_range)
 	add	r1, r1, r0
 	sub	r1, r1, #1
 	mcrr	p15, 0, r1, r0, c12	/* clean D cache range */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv6_dcache_wb_range)
 	
 /* LINTSTUB: void armv6_dcache_wbinv_range(vaddr_t, vsize_t); */
 ENTRY(armv6_dcache_wbinv_range)
 	add	r1, r1, r0
 	sub	r1, r1, #1
 	mcrr	p15, 0, r1, r0, c14	/* clean and invaliate D cache range */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv6_dcache_wbinv_range)
 	
 /*
  * Note, we must not invalidate everything.  If the range is too big we
  * must use wb-inv of the entire cache.
  *
  * LINTSTUB: void armv6_dcache_inv_range(vaddr_t, vsize_t);
  */
 ENTRY(armv6_dcache_inv_range)
 	add	r1, r1, r0
 	sub	r1, r1, #1
 	mcrr	p15, 0, r1, r0, c6	/* invaliate D cache range */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv6_dcache_inv_range)
 
 /* LINTSTUB: void armv6_idcache_wbinv_range(vaddr_t, vsize_t); */
 ENTRY(armv6_idcache_wbinv_range)
 	add	r1, r1, r0
 	sub	r1, r1, #1
 	mcrr	p15, 0, r1, r0, c5	/* invaliate I cache range */
 	mcrr	p15, 0, r1, r0, c14	/* clean & invaliate D cache range */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
 END(armv6_idcache_wbinv_range)
 
 /* LINTSTUB: void armv6_idcache_wbinv_all(void); */
 ENTRY_NP(armv6_idcache_wbinv_all)
 	/*
 	 * We assume that the code here can never be out of sync with the
 	 * dcache, so that we can safely flush the Icache and fall through
 	 * into the Dcache purging code.
 	 */
 	mcr	p15, 0, r0, c7, c5, 0	/* Flush I cache */
 	/* Fall through to purge Dcache. */
 
 /* LINTSTUB: void armv6_dcache_wbinv_all(void); */
-ENTRY(armv6_dcache_wbinv_all)
+EENTRY(armv6_dcache_wbinv_all)	/* entry that skips the I-cache flush above; idcache_wbinv_all falls through into it */
 	mcr	p15, 0, r0, c7, c14, 0	/* clean & invalidate D cache */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain the write buffer */
 	RET
+EEND(armv6_dcache_wbinv_all)	/* inner entry is closed first, then the enclosing function */
 END(armv6_idcache_wbinv_all)
-END(armv6_dcache_wbinv_all)
 
 ENTRY(armv6_idcache_inv_all)
 	mov	r0, #0
 	mcr	p15, 0, r0, c7, c7, 0	/* invalidate all I+D cache */
 	RET
 END(armv6_idcache_inv_all)
 
Index: stable/10/sys/arm/arm/cpufunc_asm_armv7.S
===================================================================
--- stable/10/sys/arm/arm/cpufunc_asm_armv7.S	(revision 269795)
+++ stable/10/sys/arm/arm/cpufunc_asm_armv7.S	(revision 269796)
@@ -1,368 +1,368 @@
 /*-
  * Copyright (c) 2010 Per Odlund <per.odlund@armagedon.se>
  * Copyright (C) 2011 MARVELL INTERNATIONAL LTD.
  * All rights reserved.
  *
  * Developed by Semihalf.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of MARVELL nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 	.cpu cortex-a8
 
 /*
  * Literal pool for the routines below: addresses of C variables and
  * the masks used by armv7_dcache_wbinv_all / armv7_tlb_flushID_SE.
  */
 .Lcoherency_level:
 	.word	_C_LABEL(arm_cache_loc)		/* address of arm_cache_loc */
 .Lcache_type:
 	.word	_C_LABEL(arm_cache_type)	/* address of arm_cache_type[] */
 .Lway_mask:
 	.word	0x3ff		/* 10-bit way field of a cache type word */
 .Lmax_index:
 	.word	0x7fff		/* 15-bit index field of a cache type word */
 .Lpage_mask:
 	.word	0xfff		/* low 12 bits, cleared from a VA before a TLB flush */
 
 /*
  * Attribute bits that armv7_setttb and armv7_context_switch OR into the
  * translation table base value before writing it to TTBR0.
  * NOTE(review): the names suggest shareability (S, NOS) and inner/outer
  * cacheability of table walks -- confirm the encodings against the
  * TTBR0 description in the ARM Architecture Reference Manual.
  */
 #define PT_NOS          (1 << 5)
 #define PT_S 	        (1 << 1)
 #define PT_INNER_NC	0
 #define PT_INNER_WT	(1 << 0)
 #define PT_INNER_WB	((1 << 0) | (1 << 6))
 #define PT_INNER_WBWA	(1 << 6)
 #define PT_OUTER_NC	0
 #define PT_OUTER_WT	(2 << 3)
 #define PT_OUTER_WB	(3 << 3)
 #define PT_OUTER_WBWA	(1 << 3)
 	
 /* SMP kernels additionally set the PT_S and PT_NOS bits. */
 #ifdef SMP
 #define PT_ATTR	(PT_S|PT_INNER_WBWA|PT_OUTER_WBWA|PT_NOS)
 #else
 #define PT_ATTR	(PT_INNER_WBWA|PT_OUTER_WBWA)
 #endif
 
 /*
  * armv7_setttb: install a new translation table base.
  * In: r0 = translation table base (PT_ATTR is ORed in before the write).
  * The caches are written back and invalidated first, then TTBR0 is
  * written and the TLBs are invalidated.
  */
 ENTRY(armv7_setttb)
 	stmdb   sp!, {r0, lr}		/* keep r0 and lr across the call */
  	bl      _C_LABEL(armv7_idcache_wbinv_all) /* clean the D cache */
  	ldmia   sp!, {r0, lr}
  	dsb
 				
 	orr 	r0, r0, #PT_ATTR	/* add the table attribute bits */
  	mcr	p15, 0, r0, c2, c0, 0	/* Translation Table Base Register 0 (TTBR0) */
 	isb
 #ifdef SMP
  	mcr     p15, 0, r0, c8, c3, 0   /* invalidate I+D TLBs Inner Shareable*/
 #else
  	mcr     p15, 0, r0, c8, c7, 0   /* invalidate I+D TLBs */
 #endif
  	dsb
  	isb
 	RET
 END(armv7_setttb)
 
 /*
  * armv7_tlb_flushID: flush every unified TLB entry and the branch target
  * buffer.  SMP kernels use the Inner Shareable forms of both operations.
  */
 ENTRY(armv7_tlb_flushID)
 	dsb
 #ifdef SMP
 	mcr	p15, 0, r0, c8, c3, 0	/* flush Unified TLB all entries Inner Shareable */
 	mcr	p15, 0, r0, c7, c1, 6	/* flush BTB Inner Shareable */
 #else
 	mcr	p15, 0, r0, c8, c7, 0	/* flush Unified TLB all entries */
 	mcr	p15, 0, r0, c7, c5, 6	/* flush BTB */
 #endif
 	dsb
 	isb
 	mov	pc, lr			/* return */
 END(armv7_tlb_flushID)
 
 /*
  * armv7_tlb_flushID_SE: flush the unified TLB entry for one address,
  * plus the branch target buffer.
  * In: r0 = virtual address; its low 12 bits are cleared before use.
  * Clobbers r0 and r1.
  */
 ENTRY(armv7_tlb_flushID_SE)
 	ldr	r1, .Lpage_mask		/* r1 = 0xfff */
 	bic	r0, r0, r1		/* r0 = address with the low 12 bits cleared */
 #ifdef SMP
 	mcr	p15, 0, r0, c8, c3, 3	/* flush Unified TLB single entry Inner Shareable */
 	mcr	p15, 0, r0, c7, c1, 6	/* flush BTB Inner Shareable */
 #else
 	mcr	p15, 0, r0, c8, c7, 1	/* flush Unified TLB single entry */
 	mcr	p15, 0, r0, c7, c5, 6	/* flush BTB */
 #endif
 	dsb
 	isb
 	mov	pc, lr			/* return */
 END(armv7_tlb_flushID_SE)
 
 /*
  * armv7_dcache_wbinv_all: clean and invalidate the data cache by
  * set/way, one cache level at a time.
  * Based on algorithm from ARM Architecture Reference Manual
  *
  * Register use:
  *	r3 = value of arm_cache_loc (number of levels to walk)
  *	r8 = current level, counting up from 0
  *	r1 = word loaded from arm_cache_type at byte offset level * 8
  *	r2 = shift applied to the index, (r1 & 7) + 4
  *	r4 = highest way number, (r1 >> 3) & 0x3ff
  *	r5 = shift applied to the way number, clz(r4)
  *	r7 = current index, counting down from (r1 >> 13) & 0x7fff
  *	r9 = current way, counting down from r4
  *	r6 = operand of the set/way operation
  * r4-r9 are saved on the stack and restored before returning.
  */
 ENTRY(armv7_dcache_wbinv_all)
 	stmdb	sp!, {r4, r5, r6, r7, r8, r9}
 
 	/* Get cache level */
 	ldr	r0, .Lcoherency_level
 	ldr	r3, [r0]
 	cmp	r3, #0
 	beq	Finished		/* nothing to do when the level count is 0 */
 	/* For each cache level */
 	mov	r8, #0
 Loop1:
 	/* Get cache type for given level */
 	mov	r2, r8, lsl #2		/* r2 = level * 4 */
 	add	r2, r2, r2		/* r2 = level * 8 (byte offset) */
 	ldr	r0, .Lcache_type
 	ldr	r1, [r0, r2]
 
 	/* Get line size */
 	and	r2, r1, #7
 	add	r2, r2, #4
 
 	/* Get number of ways */
 	ldr	r4, .Lway_mask
 	ands	r4, r4, r1, lsr #3
 	clz	r5, r4			/* shift that moves the way number to the top bits */
 
 	/* Get max index */
 	ldr	r7, .Lmax_index
 	ands	r7, r7, r1, lsr #13
 Loop2:
 	mov	r9, r4			/* restart the way counter for this index */
 Loop3:
 	mov	r6, r8, lsl #1		/* level in bits [3:1] */
 	orr	r6, r6, r9, lsl r5	/* | way << r5 */
 	orr	r6, r6, r7, lsl r2	/* | index << r2 */
 
 	/* Clean and invalidate data cache by way/index */
 	mcr	p15, 0, r6, c7, c14, 2
 	subs	r9, r9, #1
 	bge	Loop3			/* next way, down to and including 0 */
 	subs	r7, r7, #1
 	bge	Loop2			/* next index, down to and including 0 */
 Skip:					/* not referenced in the code shown here */
 	add	r8, r8, #1
 	cmp	r3, r8
 	bne Loop1			/* next level until r8 == r3 */
 Finished:
 	dsb
 	ldmia	sp!, {r4, r5, r6, r7, r8, r9}
 	RET
 END(armv7_dcache_wbinv_all)
 
 /*
  * armv7_idcache_wbinv_all: clean and invalidate the whole D cache via
  * armv7_dcache_wbinv_all, then invalidate all I caches.
  */
 ENTRY(armv7_idcache_wbinv_all)
 	stmdb	sp!, {lr}		/* lr is overwritten by the bl below */
 	bl armv7_dcache_wbinv_all
 #ifdef SMP
 	mcr	p15, 0, r0, c7, c1, 0	/* Invalidate all I caches to PoU (ICIALLUIS) */
 #else
 	mcr	p15, 0, r0, c7, c5, 0	/* Invalidate all I caches to PoU (ICIALLU) */
 #endif
 	dsb
 	isb
 	ldmia	sp!, {lr}
 	RET
 END(armv7_idcache_wbinv_all)
 
 /* XXX Temporarily set to 32 for MV cores; this value should instead be
  * read from the Cache Type register.
  */
 .Larmv7_line_size:
 	.word	32
 
 /*
  * armv7_dcache_wb_range: clean (write back) the D cache lines that
  * cover a range.
  * In: r0 = start address, r1 = length in bytes.
  * Clobbers r0-r3 and ip.
  */
 ENTRY(armv7_dcache_wb_range)
 	ldr	ip, .Larmv7_line_size	/* ip = line size in bytes */
 	sub	r3, ip, #1		/* r3 = line mask */
 	and	r2, r0, r3		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* grow the length by that offset */
 	bic	r0, r0, r3		/* align start down to a line boundary */
 .Larmv7_wb_next:
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larmv7_wb_next		/* loop while bytes remain */
 	dsb				/* data synchronization barrier */
 	RET
 END(armv7_dcache_wb_range)
 
 /*
  * armv7_dcache_wbinv_range: clean and invalidate the D cache lines that
  * cover a range.
  * In: r0 = start address, r1 = length in bytes.
  * Clobbers r0-r3 and ip.
  */
 ENTRY(armv7_dcache_wbinv_range)
 	ldr	ip, .Larmv7_line_size	/* ip = line size in bytes */
 	sub     r3, ip, #1		/* r3 = line mask */
 	and     r2, r0, r3		/* r2 = offset of start within its line */
 	add     r1, r1, r2		/* grow the length by that offset */
 	bic     r0, r0, r3		/* align start down to a line boundary */
 .Larmv7_wbinv_next:
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larmv7_wbinv_next	/* loop while bytes remain */
 	dsb				/* data synchronization barrier */
 	RET
 END(armv7_dcache_wbinv_range)
 
 /*
  * Note, we must not invalidate everything.  If the range is too big we
  * must use wb-inv of the entire cache.
  */
 ENTRY(armv7_dcache_inv_range)
 	ldr	ip, .Larmv7_line_size	/* ip = line size in bytes */
 	sub     r3, ip, #1		/* r3 = line mask */
 	and     r2, r0, r3		/* r2 = offset of start (r0) within its line */
 	add     r1, r1, r2		/* grow the length (r1) by that offset */
 	bic     r0, r0, r3		/* align start down to a line boundary */
 .Larmv7_inv_next:
 	mcr	p15, 0, r0, c7, c6, 1	/* Invalidate D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larmv7_inv_next	/* loop while bytes remain */
 	dsb				/* data synchronization barrier */
 	RET
 END(armv7_dcache_inv_range)
 
 /*
  * armv7_idcache_wbinv_range: invalidate the I cache lines and clean and
  * invalidate the D cache lines that cover a range.
  * In: r0 = start address, r1 = length in bytes.
  * Clobbers r0-r3 and ip.
  */
 ENTRY(armv7_idcache_wbinv_range)
 	ldr	ip, .Larmv7_line_size	/* ip = line size in bytes */
 	sub     r3, ip, #1		/* r3 = line mask */
 	and     r2, r0, r3		/* r2 = offset of start within its line */
 	add     r1, r1, r2		/* grow the length by that offset */
 	bic     r0, r0, r3		/* align start down to a line boundary */
 .Larmv7_id_wbinv_next:
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	mcr	p15, 0, r0, c7, c14, 1	/* Purge D cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larmv7_id_wbinv_next	/* loop while bytes remain */
 	/*
 	 * The dsb must come first: it waits for the cache maintenance
 	 * operations to complete, and only then may the isb discard
 	 * instructions that were fetched before they took effect.
 	 */
 	dsb				/* data synchronization barrier */
 	isb				/* instruction synchronization barrier */
 	RET
 END(armv7_idcache_wbinv_range)
 
 /*
  * armv7_icache_sync_all: invalidate all I caches (the Inner Shareable
  * form on SMP kernels).
  */
 ENTRY_NP(armv7_icache_sync_all)
 #ifdef SMP
 	mcr	p15, 0, r0, c7, c1, 0	/* Invalidate all I cache to PoU Inner Shareable */
 #else
 	mcr	p15, 0, r0, c7, c5, 0	/* Invalidate all I cache to PoU (ICIALLU) */
 #endif
 	/*
 	 * dsb first so the invalidate has completed, then isb so that no
 	 * instruction fetched before its completion is executed.
 	 */
 	dsb				/* data synchronization barrier */
 	isb				/* instruction synchronization barrier */
 	RET
 END(armv7_icache_sync_all)
 
 /*
  * armv7_icache_sync_range: make the I cache coherent with the D cache
  * for a range, by cleaning each D cache line and then invalidating the
  * matching I cache line.
  * In: r0 = start address, r1 = length in bytes.
  * Clobbers r0-r3 and ip.
  */
 ENTRY_NP(armv7_icache_sync_range)
 	ldr	ip, .Larmv7_line_size	/* ip = line size in bytes */
 	/*
 	 * Align the start down to a line boundary and grow the length by
 	 * the same amount, as the other range routines in this file do.
 	 * Without this, an unaligned range that crosses a line boundary
 	 * leaves its last line untouched.
 	 */
 	sub	r3, ip, #1		/* r3 = line mask */
 	and	r2, r0, r3		/* r2 = offset of start within its line */
 	add	r1, r1, r2
 	bic	r0, r0, r3
 .Larmv7_sync_next:
 	/* Clean the D line before invalidating the I line that refetches it. */
 	mcr	p15, 0, r0, c7, c10, 1	/* Clean D cache SE with VA */
 	mcr	p15, 0, r0, c7, c5, 1	/* Invalidate I cache SE with VA */
 	add	r0, r0, ip
 	subs	r1, r1, ip
 	bhi	.Larmv7_sync_next	/* loop while bytes remain */
 	/* dsb completes the cache operations before isb flushes the pipeline. */
 	dsb				/* data synchronization barrier */
 	isb				/* instruction synchronization barrier */
 	RET
 END(armv7_icache_sync_range)
 
 /*
  * armv7_cpu_sleep: issue a data synchronization barrier, then wait for
  * an interrupt.  Any argument in r0 is not examined.
  */
 ENTRY(armv7_cpu_sleep)
 	dsb				/* data synchronization barrier */
 	wfi  				/* wait for interrupt */
 	RET
 END(armv7_cpu_sleep)
 
 /*
  * armv7_context_switch: load a new translation table base and flush the
  * TLBs.  Unlike armv7_setttb, no cache maintenance is done here.
  * In: r0 = translation table base (PT_ATTR is ORed in before the write).
  */
 ENTRY(armv7_context_switch)
 	dsb
 	orr     r0, r0, #PT_ATTR	/* add the table attribute bits */
 			
 	mcr	p15, 0, r0, c2, c0, 0	/* set the new TTB */
 	isb
 #ifdef SMP
 	mcr	p15, 0, r0, c8, c3, 0	/* and flush the I+D tlbs Inner Sharable */
 #else
 	mcr	p15, 0, r0, c8, c7, 0	/* and flush the I+D tlbs */
 #endif
 	dsb
 	isb
 	RET
 END(armv7_context_switch)
 
 /* armv7_drain_writebuf: a single data synchronization barrier. */
 ENTRY(armv7_drain_writebuf)
 	dsb
 	RET
 END(armv7_drain_writebuf)
 
 /*
  * armv7_sev: data synchronization barrier followed by the SEV
  * (send event) instruction.
  */
 ENTRY(armv7_sev)
 	dsb				/* barrier ahead of the event */
 	sev
 	nop
 	RET
 END(armv7_sev)
 
 /*
  * armv7_auxctrl: read-modify-write of the CP15 c1, c0, 1 register
  * (the Auxiliary Control Register, going by the function name).
  * In:  r0 = bits to clear, r1 = bits to XOR into the result.
  * Out: r0 = previous register value.
  * The register is only written when the new value differs.
  */
 ENTRY(armv7_auxctrl)
 	mrc p15, 0, r2, c1, c0, 1	/* r2 = current value */
 	bic r3, r2, r0	/* Clear bits */
 	eor r3, r3, r1  /* XOR bits */
 
 	teq r2, r3			/* any change? */
 	mcrne p15, 0, r3, c1, c0, 1	/* write back only if changed */
 	mov r0, r2			/* return the old value */
 	RET
 END(armv7_auxctrl)
 
 /*
  * Invalidate all I+D+branch cache.  Used by startup code, which counts
  * on the fact that only r0-r3,ip are modified and no stack space is used.
  */
 ENTRY(armv7_idcache_inv_all)
 	mov     r0, #0
 	mcr     p15, 2, r0, c0, c0, 0   @ set cache level to L1
 	mrc     p15, 1, r0, c0, c0, 0   @ read CCSIDR
 
 	ubfx    r2, r0, #13, #15        @ get num sets - 1 from CCSIDR
 	ubfx    r3, r0, #3, #10         @ get numways - 1 from CCSIDR
 	clz     r1, r3                  @ number of bits to MSB of way
 	lsl     r3, r3, r1              @ shift into position
 	mov     ip, #1                  @ ip = 1, shifted to one way step below
 	lsl     ip, ip, r1              @ ip now contains the way decr
 
 	ubfx    r0, r0, #0, #3          @ get linesize from CCSIDR
 	add     r0, r0, #4              @ apply bias
 	lsl     r2, r2, r0              @ shift sets by log2(linesize)
 	add     r3, r3, r2              @ merge numsets - 1 with numways - 1
 	sub     ip, ip, r2              @ subtract numsets - 1 from way decr
 	mov     r1, #1
 	lsl     r1, r1, r0              @ r1 now contains the set decr
 	mov     r2, ip                  @ r2 now contains set way decr
 
 	/* r3 = ways/sets, r2 = way decr, r1 = set decr, r0 and ip are free */
 1:      mcr     p15, 0, r3, c7, c6, 2   @ invalidate line
 	movs    r0, r3                  @ get current way/set
 	beq     2f                      @ at 0 means we are done.
 	movs    r0, r0, lsl #10         @ clear way bits leaving only set bits
 	subne   r3, r3, r1              @ non-zero?, decrement set #
 	subeq   r3, r3, r2              @ zero?, decrement way # and restore set count
 	b       1b                      @ next line (way 0 / set 0 is done before exit)
 
 2:	dsb                             @ wait for stores to finish
 	mov     r0, #0                  @ and ...
 	mcr     p15, 0, r0, c7, c5, 0   @ invalidate instruction+branch cache
 	isb                             @ instruction sync barrier
 	bx      lr                      @ return
-END(armv7_l1cache_inv_all)
+END(armv7_idcache_inv_all)
 
 /*
  * armv7_sleep: data synchronization barrier, then wait for an interrupt.
  * Same instruction sequence as armv7_cpu_sleep, returning with bx lr.
  */
 ENTRY_NP(armv7_sleep)
 	dsb
 	wfi
 	bx	lr
 END(armv7_sleep)
 
Index: stable/10/sys/arm/arm/cpufunc_asm_xscale.S
===================================================================
--- stable/10/sys/arm/arm/cpufunc_asm_xscale.S	(revision 269795)
+++ stable/10/sys/arm/arm/cpufunc_asm_xscale.S	(revision 269796)
@@ -1,522 +1,523 @@
 /*	$NetBSD: cpufunc_asm_xscale.S,v 1.16 2002/08/17 16:36:32 thorpej Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2002 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Allen Briggs and Jason R. Thorpe for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed for the NetBSD Project by
  *	Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 /*-
  * Copyright (c) 2001 Matt Thomas.
  * Copyright (c) 1997,1998 Mark Brinicombe.
  * Copyright (c) 1997 Causality Limited
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Causality Limited.
  * 4. The name of Causality Limited may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY CAUSALITY LIMITED ``AS IS'' AND ANY EXPRESS
  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL CAUSALITY LIMITED BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * XScale assembly functions for CPU / MMU / TLB specific operations
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Size of the XScale core D-cache.
  */
 #define	DCACHE_SIZE		0x00008000
 
 .Lblock_userspace_access:
 	.word	_C_LABEL(block_userspace_access)
 
 /*
  * CPWAIT -- Canonical method to wait for CP15 update.
  * From: Intel 80200 manual, section 2.3.3.
  *
  * NOTE: Clobbers the specified temp reg.
  */
 /* Writes pc - 4 to pc; used by CPWAIT as the "branch to next insn" step. */
 #define	CPWAIT_BRANCH							 \
 	sub	pc, pc, #4
 
 /* Read CP15 into tmp, consume tmp, then branch; execution continues inline. */
 #define	CPWAIT(tmp)							 \
 	mrc	p15, 0, tmp, c2, c0, 0	/* arbitrary read of CP15 */	;\
 	mov	tmp, tmp		/* wait for it to complete */	;\
 	CPWAIT_BRANCH			/* branch to next insn */
 
 /*
  * Operand shift used by CPWAIT_AND_RETURN: a logical shift right by 32
  * yields 0, so "lr - (tmp lsr #32)" is lr while still depending on tmp.
  */
 #define	CPWAIT_AND_RETURN_SHIFTER	lsr #32
 
 /* As CPWAIT, but the final write to pc returns to lr.  Clobbers tmp. */
 #define	CPWAIT_AND_RETURN(tmp)						 \
 	mrc	p15, 0, tmp, c2, c0, 0	/* arbitrary read of CP15 */	;\
 	/* Wait for it to complete and branch to the return address */	 \
 	sub	pc, lr, tmp, CPWAIT_AND_RETURN_SHIFTER
 
 /* xscale_cpwait: callable form of the CPWAIT sequence; clobbers r0. */
 ENTRY(xscale_cpwait)
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cpwait)
 
 /*
  * We need a separate cpu_control() entry point, since we have to
  * invalidate the Branch Target Buffer in the event the BPRD bit
  * changes in the control register.
  */
 /*
  * In:  r0 = bits to clear, r1 = bits to XOR into the result.
  * Out: r0 = previous control register value.
  * Clobbers r1-r3.
  */
 ENTRY(xscale_control)
 	mrc	p15, 0, r3, c1, c0, 0	/* Read the control register */
 	bic	r2, r3, r0		/* Clear bits */
 	eor	r2, r2, r1		/* XOR bits */
 
 	teq	r2, r3			/* Only write if there was a change */
 	mcrne	p15, 0, r0, c7, c5, 6	/* Invalidate the BTB */
 	mcrne	p15, 0, r2, c1, c0, 0	/* Write new control register */
 	mov	r0, r3			/* Return old value */
 
 	CPWAIT_AND_RETURN(r1)
 END(xscale_control)
 
 /*
  * Functions to set the MMU Translation Table Base register
  *
  * We need to clean and flush the cache as it uses virtual
  * addresses that are about to change.
  */
 ENTRY(xscale_setttb)
 	/*
 	 * In: r0 = new translation table base.
 	 * While the cache is cleaned, either the I32_bit and F32_bit bits
 	 * are set in cpsr (old cpsr kept in r3), or bit 0 of
 	 * block_userspace_access is set (old value kept in r2, its
 	 * address in r3).  The saved state is restored before returning.
 	 */
 #ifdef CACHE_CLEAN_BLOCK_INTR
 	mrs	r3, cpsr
 	orr	r1, r3, #(I32_bit | F32_bit)
 	msr	cpsr_fsxc, r1
 #else
 	ldr	r3, .Lblock_userspace_access
 	ldr	r2, [r3]
 	orr	r1, r2, #1
 	str	r1, [r3]
 #endif
 	stmfd	sp!, {r0-r3, lr}	/* preserved across the call below */
 	bl	_C_LABEL(xscale_cache_cleanID)
 	mcr	p15, 0, r0, c7, c5, 0	/* invalidate I$ and BTB */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write and fill buffer */
 
 	CPWAIT(r0)
 
 	ldmfd	sp!, {r0-r3, lr}
 
 	/* Write the TTB */
 	mcr	p15, 0, r0, c2, c0, 0
 
 	/* If we have updated the TTB we must flush the TLB */
 	mcr	p15, 0, r0, c8, c7, 0	/* invalidate I+D TLB */
 
 	/* The cleanID above means we only need to flush the I cache here */
 	mcr	p15, 0, r0, c7, c5, 0	/* invalidate I$ and BTB */
 
 	CPWAIT(r0)
 
 #ifdef CACHE_CLEAN_BLOCK_INTR
 	msr	cpsr_fsxc, r3		/* restore the saved cpsr */
 #else
 	str	r2, [r3]		/* restore block_userspace_access */
 #endif
 	RET
 END(xscale_setttb)
 
 /*
  * TLB functions
  *
  */
 /* In: r0 = address whose D and I TLB entries are flushed.  Clobbers r0. */
 ENTRY(xscale_tlb_flushID_SE)
 	mcr	p15, 0, r0, c8, c6, 1	/* flush D tlb single entry */
 	mcr	p15, 0, r0, c8, c5, 1	/* flush I tlb single entry */
 	CPWAIT_AND_RETURN(r0)
 END(xscale_tlb_flushID_SE)
 
 /*
  * Cache functions
  */
 /* Flush the whole I and D caches, then wait for CP15.  Clobbers r0. */
 ENTRY(xscale_cache_flushID)
 	mcr	p15, 0, r0, c7, c7, 0	/* flush I+D cache */
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_flushID)
 
 /* Flush the whole I cache, then wait for CP15.  Clobbers r0. */
 ENTRY(xscale_cache_flushI)
 	mcr	p15, 0, r0, c7, c5, 0	/* flush I cache */
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_flushI)
 
 /* Flush the whole D cache, then wait for CP15.  Clobbers r0. */
 ENTRY(xscale_cache_flushD)
 	mcr	p15, 0, r0, c7, c6, 0	/* flush D cache */
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_flushD)
 
 /* In: r0 = address of the I cache entry to flush.  Clobbers r0. */
 ENTRY(xscale_cache_flushI_SE)
 	mcr	p15, 0, r0, c7, c5, 1	/* flush I cache single entry */
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_flushI_SE)
 
 /* In: r0 = address of the D cache entry to flush.  Clobbers r0. */
 ENTRY(xscale_cache_flushD_SE)
 	/*
 	 * Errata (rev < 2): Must clean-dcache-line to an address
 	 * before invalidate-dcache-line to an address, or dirty
 	 * bits will not be cleared in the dcache array.
 	 */
 	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry (errata above) */
 	mcr	p15, 0, r0, c7, c6, 1	/* flush D cache single entry */
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_flushD_SE)
 
 /* In: r0 = address of the D cache entry to clean.  Clobbers r0. */
 ENTRY(xscale_cache_cleanD_E)
 	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_cleanD_E)
 
 /*
  * Information for the XScale cache clean/purge functions:
  *
  *	* Virtual address of the memory region to use
  *	* Size of memory region
  *
  * Note the virtual address for the Data cache clean operation
  * does not need to be backed by physical memory, since no loads
  * will actually be performed by the allocate-line operation.
  *
  * Note that the Mini-Data cache MUST be cleaned by executing
  * loads from memory mapped into a region reserved exclusively
  * for cleaning of the Mini-Data cache.
  */
 	.data
 
 	/*
 	 * Each *_addr word must be immediately followed by its *_size
 	 * word: XSCALE_CACHE_CLEAN_PROLOGUE and xscale_cache_clean_minidata
 	 * load the pair with a single ldmia from the *_addr location.
 	 */
 	.global	_C_LABEL(xscale_cache_clean_addr)
 _C_LABEL(xscale_cache_clean_addr):
 	.word	0x00000000
 
 	.global	_C_LABEL(xscale_cache_clean_size)
 _C_LABEL(xscale_cache_clean_size):
 	.word	DCACHE_SIZE
 
 	.global	_C_LABEL(xscale_minidata_clean_addr)
 _C_LABEL(xscale_minidata_clean_addr):
 	.word	0x00000000
 
 	.global	_C_LABEL(xscale_minidata_clean_size)
 _C_LABEL(xscale_minidata_clean_size):
 	.word	0x00000800
 
 	.text
 
 /* Literal pool: addresses of the variables above. */
 .Lxscale_cache_clean_addr:
 	.word	_C_LABEL(xscale_cache_clean_addr)
 .Lxscale_cache_clean_size:
 	.word	_C_LABEL(xscale_cache_clean_size)
 
 .Lxscale_minidata_clean_addr:
 	.word	_C_LABEL(xscale_minidata_clean_addr)
 .Lxscale_minidata_clean_size:
 	.word	_C_LABEL(xscale_minidata_clean_size)
 
 /*
  * XSCALE_CACHE_CLEAN_BLOCK / _UNBLOCK bracket a whole-cache clean.
  * With CACHE_CLEAN_BLOCK_INTR the old cpsr is kept in r3 and the I32_bit
  * and F32_bit bits are set; otherwise bit 0 of block_userspace_access is
  * set, with its address kept in r3 and its old value in ip.  BLOCK
  * clobbers r0; the registers holding saved state must survive until
  * UNBLOCK.
  */
 #ifdef CACHE_CLEAN_BLOCK_INTR
 #define	XSCALE_CACHE_CLEAN_BLOCK					\
 	mrs	r3, cpsr					;	\
 	orr	r0, r3, #(I32_bit | F32_bit)			;	\
 	msr	cpsr_fsxc, r0
 
 #define	XSCALE_CACHE_CLEAN_UNBLOCK					\
 	msr	cpsr_fsxc, r3
 #else
 #define	XSCALE_CACHE_CLEAN_BLOCK					\
 	ldr	r3, .Lblock_userspace_access			;	\
 	ldr	ip, [r3]					;	\
 	orr	r0, ip, #1					;	\
 	str	r0, [r3]
 
 #define	XSCALE_CACHE_CLEAN_UNBLOCK					\
 	str	ip, [r3]
 #endif /* CACHE_CLEAN_BLOCK_INTR */
 
 /*
  * On exit from the prologue: r1 = xscale_cache_clean_size, and
  * r0 = (xscale_cache_clean_addr ^ DCACHE_SIZE) + r1, i.e. the end of the
  * clean area selected for this pass.  The toggled address is stored back
  * so the next pass uses the other area.  Clobbers r0-r2.
  */
 #define	XSCALE_CACHE_CLEAN_PROLOGUE					\
 	XSCALE_CACHE_CLEAN_BLOCK				;	\
 	ldr	r2, .Lxscale_cache_clean_addr			;	\
 	ldmia	r2, {r0, r1}					;	\
 	/*								\
 	 * BUG ALERT!							\
 	 *								\
 	 * The XScale core has a strange cache eviction bug, which	\
 	 * requires us to use 2x the cache size for the cache clean	\
 	 * and for that area to be aligned to 2 * cache size.		\
 	 *								\
 	 * The work-around is to use 2 areas for cache clean, and to	\
 	 * alternate between them whenever this is done.  No one knows	\
 	 * why the work-around works (mmm!).				\
 	 */								\
 	eor	r0, r0, #(DCACHE_SIZE)				;	\
 	str	r0, [r2]					;	\
 	add	r0, r0, r1
 
 #define	XSCALE_CACHE_CLEAN_EPILOGUE					\
 	XSCALE_CACHE_CLEAN_UNBLOCK
 
 /*
  * Whole-cache clean with several entry points sharing one body.
  * Entering at xscale_cache_syncI / xscale_cache_purgeID flushes the I
  * cache first; entering at cleanID / purgeD / cleanD starts at the
  * D cache clean.  The clean walks the area set up by
  * XSCALE_CACHE_CLEAN_PROLOGUE downwards in 32-byte steps, issuing an
  * allocate-line operation for each step.
  */
 ENTRY_NP(xscale_cache_syncI)
-ENTRY_NP(xscale_cache_purgeID)
+
+EENTRY_NP(xscale_cache_purgeID)
 	mcr	p15, 0, r0, c7, c5, 0	/* flush I cache (D cleaned below) */
-ENTRY_NP(xscale_cache_cleanID)
-ENTRY_NP(xscale_cache_purgeD)
-ENTRY(xscale_cache_cleanD)
+EENTRY_NP(xscale_cache_cleanID)
+EENTRY_NP(xscale_cache_purgeD)
+EENTRY(xscale_cache_cleanD)
 	XSCALE_CACHE_CLEAN_PROLOGUE
 
 1:	subs	r0, r0, #32		/* step down to the previous line */
 	mcr	p15, 0, r0, c7, c2, 5	/* allocate cache line */
 	subs	r1, r1, #32		/* r1 = bytes of the area left */
 	bne	1b
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT(r0)
 
 	XSCALE_CACHE_CLEAN_EPILOGUE
 	RET
+EEND(xscale_cache_cleanD)
+EEND(xscale_cache_purgeD)
+EEND(xscale_cache_cleanID)
+EEND(xscale_cache_purgeID)
 END(xscale_cache_syncI)
-END(xscale_cache_purgeID)
-END(xscale_cache_cleanID)
-END(xscale_cache_purgeD)
-END(xscale_cache_cleanD)
 
 /*
  * Clean the mini-data cache.
  *
  * It's expected that we only use the mini-data cache for
  * kernel addresses, so there is no need to purge it on
  * context switch, and no need to prevent userspace access
  * while we clean it.
  */
 ENTRY(xscale_cache_clean_minidata)
 	ldr	r2, .Lxscale_minidata_clean_addr
 	ldmia	r2, {r0, r1}		/* r0 = clean area address, r1 = its size */
 1:	ldr	r3, [r0], #32		/* load one word, then advance r0 by 32 */
 	subs	r1, r1, #32
 	bne	1b			/* until the whole area has been read */
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r1)
 END(xscale_cache_clean_minidata)
 
 /*
  * In: r0 = address of the entry.  Cleans the D cache entry, drains the
  * write buffer, then flushes the I and D cache entries.  Clobbers r1.
  */
 ENTRY(xscale_cache_purgeID_E)
 	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	CPWAIT(r1)
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 	mcr	p15, 0, r0, c7, c5, 1	/* flush I cache single entry */
 	mcr	p15, 0, r0, c7, c6, 1	/* flush D cache single entry */
 	CPWAIT_AND_RETURN(r1)
 END(xscale_cache_purgeID_E)
 
 /*
  * In: r0 = address of the entry.  Cleans the D cache entry, drains the
  * write buffer, then flushes the D cache entry.  Clobbers r1.
  */
 ENTRY(xscale_cache_purgeD_E)
 	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	CPWAIT(r1)
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 	mcr	p15, 0, r0, c7, c6, 1	/* flush D cache single entry */
 	CPWAIT_AND_RETURN(r1)
 END(xscale_cache_purgeD_E)
 
 /*
  * Soft functions
  */
 /* xscale_cache_syncI is identical to xscale_cache_purgeID */
 
-ENTRY(xscale_cache_cleanID_rng)
+EENTRY(xscale_cache_cleanID_rng)
 ENTRY(xscale_cache_cleanD_rng)
 	/*
 	 * In: r0 = start address, r1 = length in bytes.
 	 * Ranges of 0x4000 bytes or more are handed to the whole-cache
 	 * routine; smaller ones are cleaned one 32-byte line at a time.
 	 */
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscale_cache_cleanID)
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* grow the length by that offset */
 	bic	r0, r0, #0x1f		/* align start down to a line boundary */
 
 1:	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	add	r0, r0, #32
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
-END(xscale_cache_cleanID_rng)
+/*END(xscale_cache_cleanID_rng)*/
 END(xscale_cache_cleanD_rng)
 
 /*
  * In: r0 = start address, r1 = length in bytes.
  * Cleans and flushes the D cache entries and flushes the I cache entries
  * covering the range; ranges of 0x4000 bytes or more are handed to
  * xscale_cache_purgeID.  Clobbers r0-r2.
  */
 ENTRY(xscale_cache_purgeID_rng)
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscale_cache_purgeID)
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* grow the length by that offset */
 	bic	r0, r0, #0x1f		/* align start down to a line boundary */
 
 1:	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	mcr	p15, 0, r0, c7, c6, 1	/* flush D cache single entry */
 	mcr	p15, 0, r0, c7, c5, 1	/* flush I cache single entry */
 	add	r0, r0, #32
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_purgeID_rng)
 
 /*
  * In: r0 = start address, r1 = length in bytes.
  * Cleans and flushes the D cache entries covering the range; ranges of
  * 0x4000 bytes or more are handed to xscale_cache_purgeD.
  * Clobbers r0-r2.
  */
 ENTRY(xscale_cache_purgeD_rng)
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscale_cache_purgeD)
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* grow the length by that offset */
 	bic	r0, r0, #0x1f		/* align start down to a line boundary */
 
 1:	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	mcr	p15, 0, r0, c7, c6, 1	/* flush D cache single entry */
 	add	r0, r0, #32
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_purgeD_rng)
 
 /*
  * In: r0 = start address, r1 = length in bytes.
  * Cleans the D cache entries and flushes the I cache entries covering
  * the range; ranges of 0x4000 bytes or more are handed to
  * xscale_cache_syncI.  Clobbers r0-r2.
  */
 ENTRY(xscale_cache_syncI_rng)
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscale_cache_syncI)
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* grow the length by that offset */
 	bic	r0, r0, #0x1f		/* align start down to a line boundary */
 
 1:	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	mcr	p15, 0, r0, c7, c5, 1	/* flush I cache single entry */
 	add	r0, r0, #32
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_syncI_rng)
 
 /*
  * In: r0 = start address, r1 = length in bytes.
  * Flushes the D cache entries covering the range without cleaning them
  * first.  Unlike the other *_rng routines there is no size cut-over to
  * a whole-cache operation.  Clobbers r0-r2.
  */
 ENTRY(xscale_cache_flushD_rng)
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* grow the length by that offset */
 	bic	r0, r0, #0x1f		/* align start down to a line boundary */
 
 1:	mcr	p15, 0, r0, c7, c6, 1	/* flush D cache single entry */
 	add	r0, r0, #32
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain */
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscale_cache_flushD_rng)
 
 /*
  * Context switch.
  *
  * These are the CPU-specific parts of the context switcher cpu_switch().
  * These functions actually perform the TTB reload.
  *
  * NOTE: Special calling convention
  *	r1, r4-r13 must be preserved
  */
 ENTRY(xscale_context_switch)
 	/*
 	 * CF_CACHE_PURGE_ID will *ALWAYS* be called prior to this.
 	 * Thus the data cache will contain only kernel data and the
 	 * instruction cache will contain only kernel code, and all
 	 * kernel mappings are shared by all processes.
 	 *
 	 * In: r0 = new translation table base.  Only r0 is modified here
 	 * (by CPWAIT_AND_RETURN).
 	 */
 
 	/* Write the TTB */
 	mcr	p15, 0, r0, c2, c0, 0
 
 	/* If we have updated the TTB we must flush the TLB */
 	mcr	p15, 0, r0, c8, c7, 0	/* flush the I+D tlb */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscale_context_switch)
 
 /*
  * xscale_cpu_sleep
  *
  * This is called when there is nothing on any of the run queues.
  * We go into IDLE mode so that any IRQ or FIQ will awaken us.
  *
  * If this is called with anything other than ARM_SLEEP_MODE_IDLE,
  * ignore it.
  */
 ENTRY(xscale_cpu_sleep)
 	/*
 	 * In: r0 = requested sleep mode.  Only mode 0 is acted on; any
 	 * other value returns immediately.
 	 * NOTE(review): assumes ARM_SLEEP_MODE_IDLE == 0, as the literal
 	 * in the original test implies -- confirm against its definition.
 	 *
 	 * This must be a compare: "tst r0, #0" ANDs with zero, which
 	 * always sets Z, so the bne below could never be taken.
 	 */
 	cmp	r0, #0x00000000
 	bne	1f
 	mov	r0, #0x1
 	mcr	p14, 0, r0, c7, c0, 0	/* write 1 to CP14 c7; presumably selects IDLE -- confirm */
 
 1:
 	RET
 END(xscale_cpu_sleep)
 
Index: stable/10/sys/arm/arm/cpufunc_asm_xscale_c3.S
===================================================================
--- stable/10/sys/arm/arm/cpufunc_asm_xscale_c3.S	(revision 269795)
+++ stable/10/sys/arm/arm/cpufunc_asm_xscale_c3.S	(revision 269796)
@@ -1,415 +1,416 @@
 /*	$NetBSD: cpufunc_asm_xscale.S,v 1.16 2002/08/17 16:36:32 thorpej Exp $	*/
 
 /*-
  * Copyright (c) 2007 Olivier Houchard
  * Copyright (c) 2001, 2002 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Allen Briggs and Jason R. Thorpe for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed for the NetBSD Project by
  *	Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 /*-
  * Copyright (c) 2001 Matt Thomas.
  * Copyright (c) 1997,1998 Mark Brinicombe.
  * Copyright (c) 1997 Causality Limited
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Causality Limited.
  * 4. The name of Causality Limited may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY CAUSALITY LIMITED ``AS IS'' AND ANY EXPRESS
  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL CAUSALITY LIMITED BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * XScale core 3 assembly functions for CPU / MMU / TLB specific operations
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Size of the XScale core D-cache.
  */
 #define	DCACHE_SIZE		0x00008000
 
 /*
  * Literal word holding the address of block_userspace_access, so the
  * code below can fetch that address with a pc-relative ldr.
  */
 .Lblock_userspace_access:
 	.word	_C_LABEL(block_userspace_access)
 
 /*
  * CPWAIT -- Canonical method to wait for CP15 update.
  * From: Intel 80200 manual, section 2.3.3.
  *
  * The sequence reads a CP15 register into a temp, consumes that temp so
  * the read must complete, and then branches so that following
  * instructions are fetched after the update.
  *
  * NOTE: Clobbers the specified temp reg.
  */

 /*
  * Branch to the next instruction.  In ARM state pc reads as the address
  * of the current instruction plus 8, so pc - 4 is the instruction that
  * follows this one.
  */
 #define	CPWAIT_BRANCH							 \
 	sub	pc, pc, #4
 
 #define	CPWAIT(tmp)							 \
 	mrc	p15, 0, tmp, c2, c0, 0	/* arbitrary read of CP15 */	;\
 	mov	tmp, tmp		/* wait for it to complete */	;\
 	CPWAIT_BRANCH			/* branch to next insn */
 
 /*
  * "tmp, lsr #32" evaluates to zero, so "sub pc, lr, tmp, lsr #32" is a
  * plain return through lr that nevertheless depends on the value the
  * mrc loaded into tmp.
  */
 #define	CPWAIT_AND_RETURN_SHIFTER	lsr #32
 
 /* CPWAIT folded into the function return; clobbers tmp. */
 #define	CPWAIT_AND_RETURN(tmp)						 \
 	mrc	p15, 0, tmp, c2, c0, 0	/* arbitrary read of CP15 */	;\
 	/* Wait for it to complete and branch to the return address */	 \
 	sub	pc, lr, tmp, CPWAIT_AND_RETURN_SHIFTER
 /*
  * When defined, the TTB values written by xscalec3_setttb and
  * xscalec3_context_switch get 0x18 or'ed in ("cache the page table
  * in L2").
  */
 #define ARM_USE_L2_CACHE
 
 /*
  * L2 cache geometry used by the set/way loop in xscalec3_l2cache_purge:
  * 0x80000 / (8 * 32) = 2048 sets.
  */
 #define L2_CACHE_SIZE		0x80000
 #define L2_CACHE_WAYS		8
 #define L2_CACHE_LINE_SIZE	32
 #define L2_CACHE_SETS		(L2_CACHE_SIZE / \
     (L2_CACHE_WAYS * L2_CACHE_LINE_SIZE))
 
 /*
  * L1 D-cache geometry used by the whole-cache set/way loop:
  * (32 * 1024) / (4 * 32) = 256 sets.
  *
  * L1_DCACHE_SIZE is parenthesized so that it expands as a single value
  * inside any larger expression, not only in the one form used below.
  */
 #define L1_DCACHE_SIZE		(32 * 1024)
 #define L1_DCACHE_WAYS		4
 #define L1_DCACHE_LINE_SIZE	32
 #define L1_DCACHE_SETS		(L1_DCACHE_SIZE / \
     (L1_DCACHE_WAYS * L1_DCACHE_LINE_SIZE))
 /*
  * XSCALE_CACHE_CLEAN_BLOCK / XSCALE_CACHE_CLEAN_UNBLOCK
  *
  * Bracket a whole-cache clean loop.  BLOCK pushes r4 and uses it to
  * hold state that UNBLOCK needs, so the code between the two macros
  * must leave r4 (and, in the second variant, ip) untouched and must
  * keep the stack balanced.  Both variants clobber r0.
  *
  * CACHE_CLEAN_BLOCK_INTR defined:
  *	BLOCK saves CPSR in r4 and sets the I and F bits; UNBLOCK writes
  *	the saved CPSR back.
  *
  * Otherwise:
  *	BLOCK loads r4 with the address of block_userspace_access, saves
  *	the old value in ip and stores the old value with bit 0 set;
  *	UNBLOCK stores the saved value back through r4.  The address
  *	lives in r4 precisely because the clean loop uses r0-r3 as
  *	scratch, so UNBLOCK must not address the variable through r3.
  */
 #ifdef CACHE_CLEAN_BLOCK_INTR
 #define	XSCALE_CACHE_CLEAN_BLOCK					\
 	stmfd	sp!, {r4}					;	\
 	mrs	r4, cpsr					;	\
 	orr	r0, r4, #(I32_bit | F32_bit)			;	\
 	msr	cpsr_fsxc, r0
 
 #define	XSCALE_CACHE_CLEAN_UNBLOCK					\
 	msr	cpsr_fsxc, r4					;	\
 	ldmfd	sp!, {r4}
 #else
 #define	XSCALE_CACHE_CLEAN_BLOCK					\
 	stmfd	sp!, {r4}					;	\
 	ldr	r4, .Lblock_userspace_access			;	\
 	ldr	ip, [r4]					;	\
 	orr	r0, ip, #1					;	\
 	str	r0, [r4]
 
 #define	XSCALE_CACHE_CLEAN_UNBLOCK					\
 	str	ip, [r4]					;	\
 	ldmfd	sp!, {r4}
 #endif /* CACHE_CLEAN_BLOCK_INTR */
 
 
 /*
  * Whole-cache maintenance.  Five names share this one body:
  *
  *	xscalec3_cache_syncI, xscalec3_cache_purgeID
  *		enter at the top and flush the entire I cache first;
  *	xscalec3_cache_cleanID, xscalec3_cache_purgeD, xscalec3_cache_cleanD
  *		enter after that instruction and skip the I-cache flush.
  *
  * Every path then cleans and invalidates each L1 D-cache line by
  * set/way (4 ways x L1_DCACHE_SETS sets) and drains the write buffer.
  *
  * Clobbers r0-r3 and the flags, plus whatever XSCALE_CACHE_CLEAN_BLOCK
  * clobbers.  r4 (and ip in the non-interrupt-blocking build) carry
  * state from XSCALE_CACHE_CLEAN_BLOCK to XSCALE_CACHE_CLEAN_UNBLOCK and
  * are therefore not used by the loop.
  *
  * NOTE(review): the bare label "xscalec3_cache_purgeID:" sits directly
  * above EENTRY_NP(xscalec3_cache_purgeID).  If EENTRY_NP itself emits
  * that label (its definition is not visible in this file) the symbol
  * is defined twice -- confirm against machine/asm.h.
  */
 ENTRY_NP(xscalec3_cache_syncI)
-ENTRY_NP(xscalec3_cache_purgeID)
+xscalec3_cache_purgeID:
+EENTRY_NP(xscalec3_cache_purgeID)
 	mcr	p15, 0, r0, c7, c5, 0	/* flush I cache (D cleaned below) */
-ENTRY_NP(xscalec3_cache_cleanID)
-ENTRY_NP(xscalec3_cache_purgeD)
-ENTRY(xscalec3_cache_cleanD)
+EENTRY_NP(xscalec3_cache_cleanID)
+EENTRY_NP(xscalec3_cache_purgeD)
+EENTRY(xscalec3_cache_cleanD)
 
 	XSCALE_CACHE_CLEAN_BLOCK
 	mov	r0, #0			/* r0 = way index */
 1:
 	mov	r1, r0, asl #30		/* way index into bits 31:30 */
 	mov	r2, #0			/* r2 = set index */
 2:
 	orr	r3, r1, r2, asl #5	/* r3 = way | (set << 5) */
 	mcr	p15, 0, r3, c7, c14, 2	/* clean and invalidate */
 	add	r2, r2, #1
 	cmp	r2, #L1_DCACHE_SETS
 	bne	2b			/* next set of this way */
 	add	r0, r0, #1
 	cmp	r0, #4
 	bne	1b			/* next way */
 	CPWAIT(r0)
 	XSCALE_CACHE_CLEAN_UNBLOCK
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	RET
+EEND(xscalec3_cache_purgeID)
+EEND(xscalec3_cache_cleanID)
+EEND(xscalec3_cache_purgeD)
+EEND(xscalec3_cache_cleanD)
 END(xscalec3_cache_syncI)
-END(xscalec3_cache_purgeID)
-END(xscalec3_cache_cleanID)
-END(xscalec3_cache_purgeD)
-END(xscalec3_cache_cleanD)
 
 /*
  * xscalec3_cache_purgeID_rng
  *
  * Clean and invalidate the L1 D cache and flush the I cache for every
  * 32-byte line overlapped by a range.
  *
  *	r0	start address
  *	r1	length in bytes
  *
  * Ranges of 0x4000 bytes or more are handed to the whole-cache routine
  * instead of being walked line by line.  That tail call targets
  * xscalec3_cache_purgeID, not xscalec3_cache_cleanID: the cleanID entry
  * point lies after the whole-I-cache flush instruction, so going there
  * would leave the I cache untouched for large ranges.
  *
  * Clobbers r0-r2 and the flags (more on the whole-cache path).
  */
 ENTRY(xscalec3_cache_purgeID_rng)
 
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscalec3_cache_purgeID)	/* large: do it all */
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* extend length back to the line start */
 	bic	r0, r0, #0x1f		/* round start down to a 32-byte line */
 
 1:	mcr	p15, 0, r0, c7, c14, 1	/* clean/invalidate L1 D cache entry */
 	nop
 	mcr	p15, 0, r0, c7, c5, 1	/* flush I cache single entry */
 	add	r0, r0, #32		/* advance to the next line */
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain (unsigned) */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscalec3_cache_purgeID_rng)
 
 /*
  * xscalec3_cache_syncI_rng
  *
  * For every 32-byte line overlapped by a range, clean the D-cache entry
  * and flush the matching I-cache entry.
  *
  *	r0	start address
  *	r1	length in bytes
  *
  * Ranges of 0x4000 bytes or more tail-call xscalec3_cache_syncI, which
  * handles the whole cache.  Clobbers r0-r2 and the flags (more on the
  * whole-cache path).
  */
 ENTRY(xscalec3_cache_syncI_rng)
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscalec3_cache_syncI)
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* extend length back to the line start */
 	bic	r0, r0, #0x1f		/* round start down to a 32-byte line */
 
 1:	mcr	p15, 0, r0, c7, c10, 1	/* clean D cache entry */
 	mcr	p15, 0, r0, c7, c5, 1	/* flush I cache single entry */
 	add	r0, r0, #32		/* advance to the next line */
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain (unsigned) */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscalec3_cache_syncI_rng)
 	
 /*
  * xscalec3_cache_purgeD_rng
  *
  * Clean and invalidate the D-cache entry of every 32-byte line
  * overlapped by a range.
  *
  *	r0	start address
  *	r1	length in bytes
  *
  * Ranges of 0x4000 bytes or more tail-call xscalec3_cache_cleanID,
  * which is the same entry point as xscalec3_cache_purgeD (whole D cache
  * cleaned and invalidated).  Clobbers r0-r2 and the flags (more on the
  * whole-cache path).
  */
 ENTRY(xscalec3_cache_purgeD_rng)
 
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscalec3_cache_cleanID)
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* extend length back to the line start */
 	bic	r0, r0, #0x1f		/* round start down to a 32-byte line */
 
 1:	mcr	p15, 0, r0, c7, c14, 1	/* Clean and invalidate D cache entry */
 	add	r0, r0, #32		/* advance to the next line */
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain (unsigned) */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscalec3_cache_purgeD_rng)
 
 /*
  * xscalec3_cache_cleanID_rng / xscalec3_cache_cleanD_rng
  *
  * Two names for one body: clean the L1 D-cache entry of every 32-byte
  * line overlapped by a range.  No I-cache operation is issued on the
  * per-line path.
  *
  *	r0	start address
  *	r1	length in bytes
  *
  * Ranges of 0x4000 bytes or more tail-call xscalec3_cache_cleanID, the
  * whole-cache routine (which cleans and also invalidates).  Clobbers
  * r0-r2 and the flags (more on the whole-cache path).
  */
 ENTRY(xscalec3_cache_cleanID_rng)
-ENTRY(xscalec3_cache_cleanD_rng)
+EENTRY(xscalec3_cache_cleanD_rng)
 
 	cmp	r1, #0x4000
 	bcs	_C_LABEL(xscalec3_cache_cleanID)
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* extend length back to the line start */
 	bic	r0, r0, #0x1f		/* round start down to a 32-byte line */
 
 1:	mcr	p15, 0, r0, c7, c10, 1	/* clean L1 D cache entry */
 	nop
 	add	r0, r0, #32		/* advance to the next line */
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain (unsigned) */
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write buffer */
 
 	CPWAIT_AND_RETURN(r0)
+EEND(xscalec3_cache_cleanD_rng)
 END(xscalec3_cache_cleanID_rng)
-END(xscalec3_cache_cleanD_rng)
 
 /*
  * xscalec3_l2cache_purge
  *
  * Walk the entire L2 cache by set/way (8 ways x L2_CACHE_SETS sets),
  * issuing the c7, c15, 2 operation (opcode_1 = 1) on each line, with
  * barriers before and after.  Takes no arguments; clobbers r0-r3 and
  * the flags.
  */
 ENTRY(xscalec3_l2cache_purge)
 	/* Clean-up the L2 cache */
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 	mov	r0, #0			/* r0 = way index */
 1:
 	mov	r1, r0, asl #29		/* way index into bits 31:29 */
 	mov	r2, #0			/* r2 = set index */
 2:
 	orr	r3, r1, r2, asl #5	/* r3 = way | (set << 5) */
 	mcr	p15, 1, r3, c7, c15, 2
 	add	r2, r2, #1
 	cmp	r2, #L2_CACHE_SETS
 	bne	2b			/* next set of this way */
 	add	r0, r0, #1
 	cmp	r0, #8
 	bne	1b			/* next way */
 	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 
 	CPWAIT(r0)
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 	RET
 END(xscalec3_l2cache_purge)
 
 /*
  * xscalec3_l2cache_clean_rng
  *
  * Clean the L2 cache entry of every 32-byte line overlapped by a range.
  *
  *	r0	start address
  *	r1	length in bytes
  *
  * Clobbers r0-r2 and the flags.
  */
 ENTRY(xscalec3_l2cache_clean_rng)
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* extend length back to the line start */
 	bic	r0, r0, #0x1f		/* round start down to a 32-byte line */
 
 1:	mcr	p15, 1, r0, c7, c11, 1	/* Clean L2 D cache entry */
 	add	r0, r0, #32		/* advance to the next line */
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain (unsigned) */
 
 
 	CPWAIT(r0)
 
 	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscalec3_l2cache_clean_rng)
 
 /*
  * xscalec3_l2cache_purge_rng
  *
  * Clean, then invalidate, the L2 cache entry of every 32-byte line
  * overlapped by a range.
  *
  *	r0	start address
  *	r1	length in bytes
  *
  * Clobbers r0-r2 and the flags.
  */
 ENTRY(xscalec3_l2cache_purge_rng)
 
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* extend length back to the line start */
 	bic	r0, r0, #0x1f		/* round start down to a 32-byte line */
 
 1:	mcr	p15, 1, r0, c7, c11, 1	/* Clean L2 D cache entry */
 	mcr	p15, 1, r0, c7, c7, 1   /* Invalidate L2 D cache entry */
 	add	r0, r0, #32		/* advance to the next line */
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain (unsigned) */
 
 	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscalec3_l2cache_purge_rng)
 
 /*
  * xscalec3_l2cache_flush_rng
  *
  * Invalidate (without a preceding clean) the L2 cache line of every
  * 32-byte line overlapped by a range.
  *
  *	r0	start address
  *	r1	length in bytes
  *
  * Clobbers r0-r2 and the flags.
  */
 ENTRY(xscalec3_l2cache_flush_rng)
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 
 	and	r2, r0, #0x1f		/* r2 = offset of start within its line */
 	add	r1, r1, r2		/* extend length back to the line start */
 	bic	r0, r0, #0x1f		/* round start down to a 32-byte line */
 
 1:	mcr	p15, 1, r0, c7, c7, 1   /* Invalidate L2 cache line */
 	add	r0, r0, #32		/* advance to the next line */
 	subs	r1, r1, #32
 	bhi	1b			/* loop while bytes remain (unsigned) */
 	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mcr	p15, 0, r0, c7, c10, 5	/* Data memory barrier */
 	CPWAIT_AND_RETURN(r0)
 END(xscalec3_l2cache_flush_rng)
 
 /*
  * Functions to set the MMU Translation Table Base register
  *
  * We need to clean and flush the cache as it uses virtual
  * addresses that are about to change.
  *
  *	r0	new translation table base value
  *
  * Sequence: block (interrupts, or userspace access via
  * block_userspace_access), clean the whole cache, invalidate the
  * I cache, write the TTB, invalidate the TLBs, unblock.
  *
  * r2/r3 hold the state needed to unblock; they are saved on the stack
  * together with r0, r1 and lr across the call to
  * xscalec3_cache_cleanID, which clobbers r0-r3.
  */
 ENTRY(xscalec3_setttb)
 #ifdef CACHE_CLEAN_BLOCK_INTR
 	mrs	r3, cpsr		/* r3 = CPSR to restore on exit */
 	orr	r1, r3, #(I32_bit | F32_bit)
 	msr	cpsr_fsxc, r1		/* mask IRQ and FIQ */
 #else
 	ldr	r3, .Lblock_userspace_access	/* r3 = &block_userspace_access */
 	ldr	r2, [r3]		/* r2 = old value, restored on exit */
 	orr	r1, r2, #1
 	str	r1, [r3]		/* set bit 0 for the duration */
 #endif
 	stmfd	sp!, {r0-r3, lr}
 	bl	_C_LABEL(xscalec3_cache_cleanID)
 	mcr	p15, 0, r0, c7, c5, 0	/* invalidate I$ and BTB */
 	mcr	p15, 0, r0, c7, c10, 4	/* drain write and fill buffer */
 
 	CPWAIT(r0)
 
 	ldmfd	sp!, {r0-r3, lr}
 
 #ifdef ARM_USE_L2_CACHE
 	orr	r0, r0, #0x18	/* cache the page table in L2 */
 #endif
 	/* Write the TTB */
 	mcr	p15, 0, r0, c2, c0, 0
 
 	/* If we have updated the TTB we must flush the TLB */
 	mcr	p15, 0, r0, c8, c7, 0	/* invalidate I+D TLB */
 
 	CPWAIT(r0)
 
 #ifdef CACHE_CLEAN_BLOCK_INTR
 	msr	cpsr_fsxc, r3		/* restore the saved CPSR */
 #else
 	str	r2, [r3]		/* restore block_userspace_access */
 #endif
 	RET
 END(xscalec3_setttb)
 
 /*
  * Context switch.
  *
  * This is the CPU-specific part of the context switcher cpu_switch().
  * It performs the actual TTB reload.
  *
  *	r0	new translation table base value
  *
  * NOTE: Special calling convention
  *	r1, r4-r13 must be preserved
  *
  * r0 is modified and then clobbered by CPWAIT_AND_RETURN.
  */
 ENTRY(xscalec3_context_switch)
 	/*
 	 * CF_CACHE_PURGE_ID will *ALWAYS* be called prior to this.
 	 * Thus the data cache will contain only kernel data and the
 	 * instruction cache will contain only kernel code, and all
 	 * kernel mappings are shared by all processes.
 	 */
 #ifdef ARM_USE_L2_CACHE
 	orr	r0, r0, #0x18	/* Cache the page table in L2 */
 #endif
 	/* Write the TTB */
 	mcr	p15, 0, r0, c2, c0, 0
 
 	/* If we have updated the TTB we must flush the TLB */
 	mcr	p15, 0, r0, c8, c7, 0	/* flush the I+D tlb */
 
 	CPWAIT_AND_RETURN(r0)
 END(xscalec3_context_switch)
 
Index: stable/10/sys/arm/arm/exception.S
===================================================================
--- stable/10/sys/arm/arm/exception.S	(revision 269795)
+++ stable/10/sys/arm/arm/exception.S	(revision 269796)
@@ -1,463 +1,463 @@
 /*	$NetBSD: exception.S,v 1.13 2003/10/31 16:30:15 scw Exp $	*/
 
 /*-
  * Copyright (c) 1994-1997 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * exception.S
  *
  * Low level handlers for exception vectors
  *
  * Created      : 24/09/94
  *
  * Based on kate/display/abort.s
  *
  */
 
 #include "assym.s"
 
 #include <machine/asm.h>
 #include <machine/armreg.h>
 #include <machine/asmacros.h>
 __FBSDID("$FreeBSD$");
 
 	.text	
 	.align	0
 
 /*
  * ASM macros for pushing and pulling trapframes from the stack
  *
  * These macros are used to handle the irqframe and trapframe structures
  * defined above.
  */
 
 /*
  * PUSHFRAME - macro to push a trap frame on the stack in the current mode
  * Since the current mode is used, the SVC lr field is not defined.
  *
  * Resulting layout, lowest address (new sp) first:
  *	spsr, r0-r12, user sp, user lr, two unwritten words,
  *	return address (lr on entry), one pad word
  * for a total of 20 words.  r0 is clobbered after it has been saved.
  *
  * The ARM_TP_ADDRESS variant additionally stores 0 at ARM_RAS_START and
  * 0xffffffff at ARM_RAS_START + 4, clobbering r1 as well.
  *
  * NOTE: r13 and r14 are stored separately as a work around for the
  * SA110 rev 2 STM^ bug
  */
 #ifdef ARM_TP_ADDRESS
 #define PUSHFRAME							   \
 	sub	sp, sp, #4;		/* Align the stack */		   \
 	str	lr, [sp, #-4]!;		/* Push the return address */	   \
 	sub	sp, sp, #(4*17);	/* Adjust the stack pointer */	   \
 	stmia	sp, {r0-r12};		/* Push the user mode registers */ \
 	add	r0, sp, #(4*13);	/* r0 = address of user sp slot */ \
 	stmia	r0, {r13-r14}^;		/* Push the user mode registers */ \
 	mov	r0, r0;			/* NOP for previous instruction */ \
 	mrs	r0, spsr;		/* Put the SPSR on the stack */	   \
 	str	r0, [sp, #-4]!;						   \
 	ldr	r0, =ARM_RAS_START;					   \
 	mov	r1, #0;							   \
 	str	r1, [r0];						   \
 	mov	r1, #0xffffffff;					   \
 	str	r1, [r0, #4];
 #else
 #define PUSHFRAME							   \
 	sub	sp, sp, #4;		/* Align the stack */		   \
 	str	lr, [sp, #-4]!;		/* Push the return address */	   \
 	sub	sp, sp, #(4*17);	/* Adjust the stack pointer */	   \
 	stmia	sp, {r0-r12};		/* Push the user mode registers */ \
 	add	r0, sp, #(4*13);	/* r0 = address of user sp slot */ \
 	stmia	r0, {r13-r14}^;		/* Push the user mode registers */ \
 	mov	r0, r0;			/* NOP for previous instruction */ \
 	mrs	r0, spsr;		/* Put the SPSR on the stack */	   \
 	str	r0, [sp, #-4]!;
 #endif
 
 /*
  * PULLFRAME - macro to pull a trap frame from the stack in the current mode
  * Since the current mode is used, the SVC lr field is ignored.
  *
  * Undoes PUSHFRAME: pops the saved SPSR into spsr, reloads r0-r12 and
  * the user-mode sp/lr, reloads lr with the saved return address and
  * drops the pad word.  The caller still has to perform the actual
  * exception return.
  *
  * The variant used when ARM_TP_ADDRESS is not defined also issues
  * clrex before restoring the registers.
  */
 
 #ifdef ARM_TP_ADDRESS
 #define PULLFRAME							   \
 	ldr	r0, [sp], #4;		/* Get the SPSR from stack */	   \
 	msr	spsr_fsxc, r0;						   \
 	ldmia	sp, {r0-r14}^;		/* Restore registers (usr mode) */ \
 	mov	r0, r0;			/* NOP for previous instruction */ \
 	add	sp, sp, #(4*17);	/* Adjust the stack pointer */	   \
  	ldr	lr, [sp], #4;		/* Pull the return address */	   \
 	add	sp, sp, #4		/* Align the stack */
 #else 
 #define PULLFRAME							   \
 	ldr	r0, [sp], #4	;	/* Get the SPSR from stack */	   \
 	msr	spsr_fsxc, r0;						   \
 	clrex;								   \
 	ldmia   sp, {r0-r14}^;		/* Restore registers (usr mode) */ \
 	mov	r0, r0;			/* NOP for previous instruction */ \
 	add	sp, sp, #(4*17);	/* Adjust the stack pointer */	   \
  	ldr	lr, [sp], #4;		/* Pull the return address */	   \
 	add	sp, sp, #4		/* Align the stack */
 #endif
 
 /*
  * PUSHFRAMEINSVC - macro to push a trap frame on the stack in SVC32 mode
  * This should only be used if the processor is not currently in SVC32
  * mode. The processor mode is switched to SVC mode and the trap frame is
  * stored. The SVC lr field is used to store the previous value of
  * lr in SVC mode.
  *
  * r0-r3 are parked just below the exception-mode sp (without moving
  * it) so they can serve as scratch while switching modes, and are
  * reloaded from there once in SVC mode.
  *
  * Resulting layout, lowest address (new sp) first:
  *	spsr, r0-r12, user sp, user lr, SVC sp, SVC lr,
  *	return address (exception-mode lr), one pad word
  * built on the SVC stack after rounding it down to 8 bytes.
  *
  * The ARM_TP_ADDRESS variant also resets the two words at
  * ARM_RAS_START and, when a restartable sequence was active and the
  * saved pc is below its end, rewrites the saved pc to the sequence
  * start; it clobbers r1 and r3-r5 after they have been saved.
  *
  * NOTE: r13 and r14 are stored separately as a work around for the
  * SA110 rev 2 STM^ bug
  */
 #ifdef ARM_TP_ADDRESS
 #define PUSHFRAMEINSVC							   \
 	stmdb	sp, {r0-r3};		/* Save 4 registers */		   \
 	mov	r0, lr;			/* Save xxx32 r14 */		   \
 	mov	r1, sp;			/* Save xxx32 sp */		   \
 	mrs	r3, spsr;		/* Save xxx32 spsr */		   \
 	mrs	r2, cpsr; 		/* Get the CPSR */		   \
 	bic	r2, r2, #(PSR_MODE);	/* Fix for SVC mode */		   \
 	orr	r2, r2, #(PSR_SVC32_MODE);				   \
 	msr	cpsr_c, r2;		/* Punch into SVC mode */	   \
 	mov	r2, sp;			/* Save	SVC sp */		   \
 	bic	sp, sp, #7;		/* Align sp to an 8-byte address */  \
 	sub	sp, sp, #4;		/* Pad trapframe to keep alignment */ \
 	str	r0, [sp, #-4]!;		/* Push return address */	   \
 	str	lr, [sp, #-4]!;		/* Push SVC lr */		   \
 	str	r2, [sp, #-4]!;		/* Push SVC sp */		   \
 	msr	spsr_fsxc, r3;		/* Restore correct spsr */	   \
 	ldmdb	r1, {r0-r3};		/* Restore 4 regs from xxx mode */ \
 	sub	sp, sp, #(4*15);	/* Adjust the stack pointer */	   \
 	stmia	sp, {r0-r12};		/* Push the user mode registers */ \
 	add	r0, sp, #(4*13);	/* r0 = address of user sp slot */ \
 	stmia	r0, {r13-r14}^;		/* Push the user mode registers */ \
 	mov	r0, r0;			/* NOP for previous instruction */ \
 	ldr	r5, =ARM_RAS_START;	/* Check if there's any RAS */	   \
 	ldr	r4, [r5, #4];		/* reset it to point at the     */ \
 	cmp	r4, #0xffffffff;	/* end of memory if necessary;  */ \
 	movne	r1, #0xffffffff;	/* leave value in r4 for later  */ \
 	strne	r1, [r5, #4];		/* comparison against PC.       */ \
 	ldr	r3, [r5];		/* Retrieve global RAS_START    */ \
 	cmp	r3, #0;			/* and reset it if non-zero.    */ \
 	movne	r1, #0;			/* If non-zero RAS_START and    */ \
 	strne	r1, [r5];		/* PC was lower than RAS_END,   */ \
 	ldrne	r1, [r0, #16];		/* adjust the saved PC so that  */ \
 	cmpne	r4, r1;			/* execution later resumes at   */ \
 	strhi	r3, [r0, #16];		/* the RAS_START location.      */ \
 	mrs	r0, spsr;						   \
 	str	r0, [sp, #-4]!
 #else
 #define PUSHFRAMEINSVC							   \
 	stmdb	sp, {r0-r3};		/* Save 4 registers */		   \
 	mov	r0, lr;			/* Save xxx32 r14 */		   \
 	mov	r1, sp;			/* Save xxx32 sp */		   \
 	mrs	r3, spsr;		/* Save xxx32 spsr */		   \
 	mrs	r2, cpsr;		/* Get the CPSR */		   \
 	bic	r2, r2, #(PSR_MODE);	/* Fix for SVC mode */		   \
 	orr	r2, r2, #(PSR_SVC32_MODE);				   \
 	msr	cpsr_c, r2;		/* Punch into SVC mode */	   \
 	mov	r2, sp;			/* Save	SVC sp */		   \
 	bic	sp, sp, #7;		/* Align sp to an 8-byte address */  \
 	sub	sp, sp, #4;		/* Pad trapframe to keep alignment */ \
 	str	r0, [sp, #-4]!;		/* Push return address */	   \
 	str	lr, [sp, #-4]!;		/* Push SVC lr */		   \
 	str	r2, [sp, #-4]!;		/* Push SVC sp */		   \
 	msr	spsr_fsxc, r3;		/* Restore correct spsr */	   \
 	ldmdb	r1, {r0-r3};		/* Restore 4 regs from xxx mode */ \
 	sub	sp, sp, #(4*15);	/* Adjust the stack pointer */	   \
 	stmia	sp, {r0-r12};		/* Push the user mode registers */ \
 	add	r0, sp, #(4*13);	/* r0 = address of user sp slot */ \
 	stmia	r0, {r13-r14}^;		/* Push the user mode registers */ \
 	mov	r0, r0;			/* NOP for previous instruction */ \
 	mrs	r0, spsr;		/* Put the SPSR on the stack */	   \
 	str	r0, [sp, #-4]!
 #endif
 
 /*
  * PULLFRAMEFROMSVCANDEXIT - macro to pull a trap frame from the stack
  * in SVC32 mode and restore the saved processor mode and PC.
  * This should be used when the SVC lr register needs to be restored on
  * exit.
  *
  * Counterpart of PUSHFRAMEINSVC.  After the saved SPSR, r0-r12 and the
  * user-mode sp/lr are restored, the final ldmia loads sp, lr and pc in
  * one instruction; because pc is in the list, the "^" form also
  * restores CPSR from SPSR, which completes the exception return.
  *
  * The variant used when ARM_TP_ADDRESS is not defined also issues
  * clrex before restoring the registers.
  */
 
 #ifdef ARM_TP_ADDRESS
 #define PULLFRAMEFROMSVCANDEXIT						   \
 	ldr	r0, [sp], #4;		/* Get the SPSR from stack */	   \
 	msr	spsr_fsxc, r0;		/* restore SPSR */		   \
 	ldmia	sp, {r0-r14}^;		/* Restore registers (usr mode) */ \
 	mov	r0, r0;	  		/* NOP for previous instruction */ \
 	add	sp, sp, #(4*15);	/* Adjust the stack pointer */	   \
 	ldmia	sp, {sp, lr, pc}^	/* Restore lr and exit */
 #else 
 #define PULLFRAMEFROMSVCANDEXIT						   \
 	ldr	r0, [sp], #4;		/* Get the SPSR from stack */	   \
 	msr	spsr_fsxc, r0;		/* restore SPSR */		   \
 	clrex;								   \
 	ldmia	sp, {r0-r14}^;		/* Restore registers (usr mode) */ \
 	mov	r0, r0;	  		/* NOP for previous instruction */ \
 	add	sp, sp, #(4*15);	/* Adjust the stack pointer */	   \
 	ldmia	sp, {sp, lr, pc}^	/* Restore lr and exit */
 #endif
 
 /*
  * UNWINDSVCFRAME emits EABI unwind directives describing the trapframe
  * left by PUSHFRAMEINSVC; it expands to nothing for non-EABI builds.
  */
 #if defined(__ARM_EABI__)
 /*
  * Unwind hints so we can unwind past functions that use
  * PULLFRAMEFROMSVCANDEXIT. They are run in reverse order.
  * As the last thing we do is restore the stack pointer
  * we can ignore the padding at the end of struct trapframe.
  */
 #define	UNWINDSVCFRAME							   \
 	.save {r13-r15};		/* Restore sp, lr, pc */	   \
 	.pad #(2*4);			/* Skip user sp and lr */	   \
 	.save {r0-r12};			/* Restore r0-r12 */		   \
 	.pad #(4)			/* Skip spsr */
 #else
 #define	UNWINDSVCFRAME
 #endif
 
 /*
  * DO_AST - run pending ASTs before returning to user mode.
  *
  * Expects sp to point at a trapframe whose first word is the saved
  * SPSR.  Interrupts are masked, and if the saved mode is not USR32 the
  * macro falls straight through.  Otherwise it loops: while the current
  * thread's flags have TDF_ASTPENDING or TDF_NEEDRESCHED set, unmask
  * interrupts, call ast(frame) and mask them again.
  *
  * Leaves IRQ and FIQ masked on exit.  Clobbers r0, r1, r4, r5, lr and
  * anything ast() clobbers.  Uses numeric local labels 1 and 2.
  */
 #define	DO_AST								\
 	ldr	r0, [sp]		/* Get the SPSR from stack */	;\
 	mrs	r4, cpsr		/* save CPSR */			;\
 	orr	r1, r4, #(I32_bit|F32_bit)				;\
 	msr	cpsr_c, r1		/* Disable interrupts */	;\
 	and	r0, r0, #(PSR_MODE)	/* Returning to USR mode? */	;\
 	teq	r0, #(PSR_USR32_MODE)					;\
 	bne	2f			/* Nope, get out now */		;\
 	bic	r4, r4, #(I32_bit|F32_bit)				;\
 1:	GET_CURTHREAD_PTR(r5)						;\
 	ldr	r1, [r5, #(TD_FLAGS)]					;\
 	and	r1, r1, #(TDF_ASTPENDING|TDF_NEEDRESCHED)		;\
 	teq	r1, #0x00000000						;\
 	beq	2f			/* Nope. Just bail */		;\
 	msr	cpsr_c, r4		/* Restore interrupts */	;\
 	mov	r0, sp							;\
 	bl	_C_LABEL(ast)		/* ast(frame) */		;\
 	orr	r0, r4, #(I32_bit|F32_bit)				;\
 	msr	cpsr_c, r0						;\
 	b	1b							;\
 2:
 
 
 /*
  * Entry point for a Software Interrupt (SWI).
  *
  * The hardware switches to svc32 mode on a swi, so we're already on the
  * right stack; just build a trapframe and call the handler.
  *
  * swi_exit is a second entry point nested inside swi_entry; it expects
  * sp to point at a trapframe laid out by PUSHFRAME.
  */
 ASENTRY_NP(swi_entry)
 	PUSHFRAME			/* Build the trapframe on the */
 	mov	r0, sp			/* svc32 stack, pass it to the */
 	bl	_C_LABEL(swi_handler)	/* swi handler. */
 	/*
 	 * The fork_trampoline() code in swtch.S arranges for the MI fork_exit()
 	 * to return to swi_exit here, to return to userland.  The net effect is
 	 * that a newly created thread appears to return from a SWI just like
 	 * the parent thread that created it.
 	 */
-ASENTRY_NP(swi_exit)
+ASEENTRY_NP(swi_exit)
 	DO_AST				/* Handle pending signals. */
 	PULLFRAME			/* Deallocate trapframe. */
 	movs	pc, lr			/* Return to userland. */
 	STOP_UNWINDING			/* Don't unwind into user mode. */
-END(swi_exit)
+EEND(swi_exit)
 END(swi_entry)
 
 /*
  * Standard exception exit handler.
  *
  * This is used to return from all exceptions except SWI.  It uses DO_AST and
  * PULLFRAMEFROMSVCANDEXIT and can only be called if the exception entry code
  * used PUSHFRAMEINSVC.
  *
  * If the return is to user mode, this uses DO_AST to deliver any pending
  * signals and/or handle TDF_NEEDRESCHED first.
  *
  * The entry stubs below load lr with this address before branching to
  * their C handlers, so the handlers return here.  On entry sp must
  * point at the trapframe.
  */
 ASENTRY_NP(exception_exit)
 	DO_AST				/* Handle pending signals. */
 	PULLFRAMEFROMSVCANDEXIT		/* Return. */
 	UNWINDSVCFRAME			/* Special unwinding for exceptions. */
 END(exception_exit)
 
 /*
  * Entry point for a Prefetch Abort exception.
  *
  * The hardware switches to the abort mode stack; we switch to svc32 before
  * calling the handler, then return directly to the original mode/stack 
  * on exit (without transitioning back through the abort mode stack).
  *
  * lr is moved back by 4 before it is saved as the frame's return
  * address.  The handler is tail-called with r0 = trapframe and
  * lr = exception_exit.
  */
 ASENTRY_NP(prefetch_abort_entry)
 #ifdef __XSCALE__
 	nop				/* Make absolutely sure any pending */
 	nop				/* imprecise aborts have occurred. */
 #endif
 	sub	lr, lr, #4		/* Adjust the lr. Transition to svc32 */
 	PUSHFRAMEINSVC			/* mode stack, build trapframe there. */
 	adr	lr, exception_exit	/* Return from handler via standard */
 	mov	r0, sp			/* exception exit routine.  Pass the */
 	b	prefetch_abort_handler	/* trapframe to the handler. */
 END(prefetch_abort_entry)
 
 /*
  * Entry point for a Data Abort exception.
  *
  * The hardware switches to the abort mode stack; we switch to svc32 before
  * calling the handler, then return directly to the original mode/stack 
  * on exit (without transitioning back through the abort mode stack).
  *
  * lr is moved back by 8 (not 4 as in the other stubs) before it is
  * saved as the frame's return address.  The handler is tail-called
  * with r0 = trapframe and lr = exception_exit.
  */
 ASENTRY_NP(data_abort_entry)
 #ifdef __XSCALE__
 	nop				/* Make absolutely sure any pending */
 	nop				/* imprecise aborts have occurred. */
 #endif
 	sub	lr, lr, #8		/* Adjust the lr. Transition to svc32 */
 	PUSHFRAMEINSVC			/* mode stack, build trapframe there. */
 	adr	lr, exception_exit	/* Return from handler via standard */
 	mov	r0, sp			/* exception exit routine.  Pass the */
 	b	data_abort_handler	/* trapframe to the handler. */
 END(data_abort_entry)
 
 /*
  * Entry point for an Undefined Instruction exception.
  *
  * The hardware switches to the undefined mode stack; we switch to svc32 before
  * calling the handler, then return directly to the original mode/stack 
  * on exit (without transitioning back through the undefined mode stack).
  *
  * lr is moved back by 4 before it is saved as the frame's return
  * address.  The handler is tail-called with r0 = trapframe and
  * lr = exception_exit.
  */
 ASENTRY_NP(undefined_entry)
 	sub	lr, lr, #4		/* Adjust the lr. Transition to svc32 */
 	PUSHFRAMEINSVC			/* mode stack, build trapframe there. */
 	adr	lr, exception_exit      /* Return from handler via standard */
 	mov	r0, sp			/* exception exit routine.  Pass the */
 	b	undefinedinstruction	/* trapframe to the handler. */
 END(undefined_entry)
 
 /*
  * Entry point for a normal IRQ.
  *
  * The hardware switches to the IRQ mode stack; we switch to svc32 before
  * calling the handler, then return directly to the original mode/stack 
  * on exit (without transitioning back through the IRQ mode stack).
  *
  * lr is moved back by 4 before it is saved as the frame's return
  * address.  arm_irq_handler is tail-called with r0 = trapframe and
  * lr = exception_exit.
  */
 ASENTRY_NP(irq_entry)
 	sub	lr, lr, #4		/* Adjust the lr. Transition to svc32 */
 	PUSHFRAMEINSVC			/* mode stack, build trapframe there. */
 	adr	lr, exception_exit	/* Return from handler via standard */
 	mov	r0, sp			/* exception exit routine.  Pass the */
 	b	_C_LABEL(arm_irq_handler)/* trapframe to the handler. */
 END(irq_entry)                           
 
 /*
  * Entry point for an FIQ interrupt.
  *
  * We don't currently support FIQ handlers very much.  Something can 
  * install itself in the FIQ vector using code (that may or may not work
  * these days) in fiq.c.  If nobody does that and an FIQ happens, this
  * default handler just disables FIQs and otherwise ignores it.
  *
  * "Disables" means setting the F bit in the SPSR: the exception return
  * below copies SPSR into CPSR, so the interrupted code resumes with FIQ
  * masked.  Clearing F in the CPSR instead would unmask FIQ while still
  * inside this handler and have no effect on the state returned to.
  */
 ASENTRY_NP(fiq_entry)
 	mrs	r8, spsr		/* FIQ handling isn't supported, */
 	orr	r8, r8, #(F32_bit)	/* just disable FIQ and return.  */
 	msr	spsr_c, r8		/* The r8 we trash here is the  */
 	subs	pc, lr, #4		/* banked FIQ-mode r8. */
 END(fiq_entry)
 
 /*
  * Entry point for an Address Exception exception.
  * This is an arm26 exception that should never happen.
  *
  * Tail-calls panic() with the format string below in r0 and CPSR,
  * SPSR and lr in r1-r3 as its arguments.
  */
 ASENTRY_NP(addr_exception_entry)
 	mov	r3, lr			/* 3rd value: LR */
 	mrs	r2, spsr		/* 2nd value: SPSR */
 	mrs	r1, cpsr		/* 1st value: CPSR */
 	adr	r0, Laddr_exception_msg	/* format string */
 	b	_C_LABEL(panic)
 Laddr_exception_msg:
 	.asciz	"Address Exception CPSR=0x%08x SPSR=0x%08x LR=0x%08x\n"
 	.balign	4
 END(addr_exception_entry)
 
 /*
  * Entry point for the system Reset vector.  
  * This should never happen, so panic.
  *
  * Tail-calls panic() with the format string below in r0 and lr in r1.
  */
 ASENTRY_NP(reset_entry)
 	mov	r1, lr			/* value for the %08x conversion */
 	adr	r0, Lreset_panicmsg	/* format string */
 	b	_C_LABEL(panic)
 	/* NOTREACHED */
 Lreset_panicmsg:
 	.asciz	"Reset vector called, LR = 0x%08x"
 	.balign	4
 END(reset_entry)
 
 /*
  * page0 and page0_data -- An image of the ARM vectors which is copied to
  * the ARM vectors page (high or low) as part of CPU initialization.  The
  * code that does the copy assumes that page0_data holds one 32-bit word
  * of data for each of the predefined ARM vectors.  It also assumes that
  * page0_data follows the vectors in page0, but other stuff can appear 
  * between the two.  We currently leave room between the two for some fiq 
  * handler code to be copied in.
  *
  * Each of the eight vector slots is a pc-relative "ldr pc" that loads
  * the matching handler address from page0_data, so the image keeps
  * working after being copied as long as the two parts keep their
  * relative distance.
  */
 	.global	_C_LABEL(page0), _C_LABEL(page0_data)
 
 _C_LABEL(page0):
 	ldr	pc, .Lreset_entry
 	ldr	pc, .Lundefined_entry
 	ldr	pc, .Lswi_entry
 	ldr	pc, .Lprefetch_abort_entry
 	ldr	pc, .Ldata_abort_entry
 	ldr	pc, .Laddr_exception_entry
 	ldr	pc, .Lirq_entry
 .fiqv:	ldr	pc, .Lfiq_entry
 	.space 256	/* room for some fiq handler code */
 
 /* Handler addresses, in the same order as the vector slots above. */
 _C_LABEL(page0_data):
 .Lreset_entry:		.word	reset_entry
 .Lundefined_entry:	.word	undefined_entry
 .Lswi_entry:		.word	swi_entry
 .Lprefetch_abort_entry:	.word	prefetch_abort_entry
 .Ldata_abort_entry:	.word	data_abort_entry
 .Laddr_exception_entry:	.word	addr_exception_entry
 .Lirq_entry:		.word	irq_entry
 .Lfiq_entry:		.word	fiq_entry
 
 /*
  * These items are used by the code in fiq.c to install what it calls the
  * "null" handler.  It's actually our default vector entry that just jumps
  * to the default handler which just disables FIQs and returns.
  *
  * fiq_nullhandler_code holds the address of the .fiqv vector
  * instruction in page0; fiq_nullhandler_size holds its length in bytes
  * (one 4-byte instruction).
  */
 	.global _C_LABEL(fiq_nullhandler_code), _C_LABEL(fiq_nullhandler_size)
 
 _C_LABEL(fiq_nullhandler_code):
 	.word	.fiqv
 _C_LABEL(fiq_nullhandler_size):
 	.word	4
 
 
Index: stable/10/sys/arm/arm/fusu.S
===================================================================
--- stable/10/sys/arm/arm/fusu.S	(revision 269795)
+++ stable/10/sys/arm/arm/fusu.S	(revision 269796)
@@ -1,393 +1,392 @@
 /*	$NetBSD: fusu.S,v 1.10 2003/12/01 13:34:44 rearnsha Exp $	*/
 
 /*-
  * Copyright (c) 1996-1998 Mark Brinicombe.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Mark Brinicombe
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <machine/asm.h>
 #include <machine/armreg.h>
 #include "assym.s"
 __FBSDID("$FreeBSD$");
 
 #ifdef _ARM_ARCH_6
 #define GET_PCB(tmp) \
 	mrc p15, 0, tmp, c13, c0, 4; \
 	add	tmp, tmp, #(TD_PCB)
 #else
 .Lcurpcb:
 	.word	_C_LABEL(__pcpu) + PC_CURPCB
 #define GET_PCB(tmp) \
 	ldr	tmp, .Lcurpcb
 #endif
 
 /*
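- * fuword(caddr_t uaddr);
- * Fetch an int from the user's address space.
+ * casuword(uaddr, old, new) / casuword32: if the word at user address
+ * uaddr equals old, store new there.  Returns the word read, or -1 on fault.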
  */
 
-ENTRY_NP(casuword32)
 ENTRY(casuword)
+EENTRY_NP(casuword32)
 	GET_PCB(r3)
 	ldr	r3, [r3]
 
 #ifdef DIAGNOSTIC
 	teq	r3, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 	stmfd	sp!, {r4, r5}
 	adr	r4, .Lcasuwordfault
 	str	r4, [r3, #PCB_ONFAULT]
 #ifdef _ARM_ARCH_6
 1:    
 	cmp     r0, #KERNBASE
 	mvnhs   r0, #0
 	bhs     2f
 	
 	ldrex   r5, [r0]
 	cmp     r5, r1
 	movne   r0, r5
 	bne     2f
 	strex   r5, r2, [r0]
 	cmp     r5, #0
 	bne     1b
 #else
 	ldrt	r5, [r0]
 	cmp	r5, r1
 	movne	r0, r5
 	streqt	r2, [r0]
 #endif
 	moveq	r0, r1
 2:
 	ldmfd	sp!, {r4, r5}
 	mov	r1, #0x00000000
 	str	r1, [r3, #PCB_ONFAULT]
 	RET
-END(casuword32)
+EEND(casuword32)
 END(casuword)
 
 /*
  * Handle faults from casuword.  Clean up and return -1.
  */
 
 .Lcasuwordfault:
 	mov	r0, #0x00000000
 	str	r0, [r3, #PCB_ONFAULT]
 	mvn	r0, #0x00000000
 	ldmfd	sp!, {r4, r5}
 	RET	
 
 /*
  * fuword(caddr_t uaddr);
  * Fetch an int from the user's address space.
  */
 
-ENTRY_NP(fuword32)
 ENTRY(fuword)
+EENTRY_NP(fuword32)
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r1, .Lfusufault
 	str	r1, [r2, #PCB_ONFAULT]
 
 	ldrt	r3, [r0]
 
 	mov	r1, #0x00000000
 	str	r1, [r2, #PCB_ONFAULT]
 	mov	r0, r3
 	RET
-END(fuword32)
+EEND(fuword32)
 END(fuword)
 
 /*
  * fusword(caddr_t uaddr);
  * Fetch a short from the user's address space.
  */
 
 ENTRY(fusword)
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r1, .Lfusufault
 	str	r1, [r2, #PCB_ONFAULT]
 
 	ldrbt	r3, [r0], #1
 	ldrbt	ip, [r0]
 #ifdef __ARMEB__
 	orr	r0, ip, r3, asl #8
 #else
 	orr	r0, r3, ip, asl #8
 #endif
 	mov	r1, #0x00000000
 	str	r1, [r2, #PCB_ONFAULT]
 	RET
 END(fusword)
 
 /*
  * fuswintr(caddr_t uaddr);
  * Fetch a short from the user's address space.  Can be called during an
  * interrupt.
  */
 
 ENTRY(fuswintr)
 	ldr	r2, Lblock_userspace_access
 	ldr	r2, [r2]
 	teq	r2, #0
 	mvnne	r0, #0x00000000
 	RETne
 
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r1, _C_LABEL(fusubailout)
 	str	r1, [r2, #PCB_ONFAULT]
 
 	ldrbt	r3, [r0], #1
 	ldrbt	ip, [r0]
 #ifdef __ARMEB__
 	orr	r0, ip, r3, asl #8
 #else
 	orr	r0, r3, ip, asl #8
 #endif
 
 	mov	r1, #0x00000000
 	str	r1, [r2, #PCB_ONFAULT]
 	RET
 END(fuswintr)
 
 Lblock_userspace_access:
 	.word	_C_LABEL(block_userspace_access)
 
 	.data
 	.align	0
 	.global	_C_LABEL(block_userspace_access)
 _C_LABEL(block_userspace_access):
 	.word	0
 	.text
 
 /*
  * fubyte(caddr_t uaddr);
  * Fetch a byte from the user's address space.
  */
 
 ENTRY(fubyte)
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r1, .Lfusufault
 	str	r1, [r2, #PCB_ONFAULT]
 
 	ldrbt	r3, [r0]
 
 	mov	r1, #0x00000000
 	str	r1, [r2, #PCB_ONFAULT]
 	mov	r0, r3
 	RET
 END(fubyte)
 
 /*
  * Handle faults from [fs]u*().  Clean up and return -1.
  */
 
 .Lfusufault:
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]
 	mvn	r0, #0x00000000
 	RET
 
 /*
  * Handle faults from [fs]u*().  Clean up and return -1.  This differs from
  * fusufault() in that trap() will recognise it and return immediately rather
  * than trying to page fault.
  */
 
 /* label must be global as fault.c references it */
 	.global	_C_LABEL(fusubailout)
 _C_LABEL(fusubailout):
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]
 	mvn	r0, #0x00000000
 	RET
 
 #ifdef DIAGNOSTIC
 /*
  * Handle earlier faults from [fs]u*(), due to no pcb
  */
 
 .Lfusupcbfault:
 	mov	r1, r0
 	adr	r0, fusupcbfaulttext
 	b	_C_LABEL(panic)
 
 fusupcbfaulttext:
 	.asciz	"Yikes - no valid PCB during fusuxxx() addr=%08x\n"
 	.align	0
 #endif
 
 /*
  * suword(caddr_t uaddr, int x);
  * Store an int in the user's address space.
  */
 
-ENTRY_NP(suword32)
 ENTRY(suword)
+EENTRY_NP(suword32)
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, .Lfusufault
 	str	r3, [r2, #PCB_ONFAULT]
 
 	strt	r1, [r0]
 
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]
 	RET
-END(suword32)
+EEND(suword32)
 END(suword)
 
 /*
  * suswintr(caddr_t uaddr, short x);
  * Store a short in the user's address space.  Can be called during an
  * interrupt.
  */
 
 ENTRY(suswintr)
 	ldr	r2, Lblock_userspace_access
 	ldr	r2, [r2]
 	teq	r2, #0
 	mvnne	r0, #0x00000000
 	RETne
 
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, _C_LABEL(fusubailout)
 	str	r3, [r2, #PCB_ONFAULT]
 
 #ifdef __ARMEB__
 	mov	ip, r1, lsr #8
 	strbt	ip, [r0], #1
 #else
 	strbt	r1, [r0], #1
 	mov	r1, r1, lsr #8
 #endif
 	strbt	r1, [r0]
 
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]
 	RET
 END(suswintr)
 
 /*
  * susword(caddr_t uaddr, short x);
  * Store a short in the user's address space.
  */
 
 ENTRY(susword)
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, .Lfusufault
 	str	r3, [r2, #PCB_ONFAULT]
 
 #ifdef __ARMEB__
 	mov	ip, r1, lsr #8
 	strbt	ip, [r0], #1
 #else
 	strbt	r1, [r0], #1
 	mov	r1, r1, lsr #8
 #endif
 	strbt	r1, [r0]
 
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]
 	RET
 END(susword)
 
 /*
  * subyte(caddr_t uaddr, char x);
  * Store a byte in the user's address space.
  */
 
 ENTRY(subyte)
 	GET_PCB(r2)
 	ldr	r2, [r2]
 
 
 #ifdef DIAGNOSTIC
 	teq	r2, #0x00000000
 	beq	.Lfusupcbfault
 #endif
 
 	adr	r3, .Lfusufault
 	str	r3, [r2, #PCB_ONFAULT]
 
 	strbt	r1, [r0]
 	mov	r0, #0x00000000
 	str	r0, [r2, #PCB_ONFAULT]
 	RET
 END(subyte)
-
Index: stable/10/sys/arm/arm/locore.S
===================================================================
--- stable/10/sys/arm/arm/locore.S	(revision 269795)
+++ stable/10/sys/arm/arm/locore.S	(revision 269796)
@@ -1,540 +1,540 @@
 /*	$NetBSD: locore.S,v 1.14 2003/04/20 16:21:40 thorpej Exp $	*/
 
 /*-
  * Copyright 2011 Semihalf
  * Copyright (C) 1994-1997 Mark Brinicombe
  * Copyright (C) 1994 Brini
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Brini.
  * 4. The name of Brini may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 #include "assym.s"
 #include <sys/syscall.h>
 #include <machine/asm.h>
 #include <machine/armreg.h>
 #include <machine/pte.h>
 
 __FBSDID("$FreeBSD$");
 
 /* What size should this really be ? It is only used by initarm() */
 #define INIT_ARM_STACK_SIZE	(2048 * 4)
 
 #define	CPWAIT_BRANCH							 \
 	sub	pc, pc, #4
 
 #define	CPWAIT(tmp)							 \
 	mrc	p15, 0, tmp, c2, c0, 0	/* arbitrary read of CP15 */	;\
 	mov	tmp, tmp		/* wait for it to complete */	;\
 	CPWAIT_BRANCH			/* branch to next insn */
 
 /*
  * This is for kvm_mkdb, and should be the address of the beginning
  * of the kernel text segment (not necessarily the same as kernbase).
  */
 	.text
 	.align	0
 .globl kernbase
 .set kernbase,KERNBASE
 .globl physaddr
 .set physaddr,PHYSADDR
 
 /*
  * On entry for FreeBSD boot ABI:
  *	r0 - metadata pointer or 0 (boothowto on AT91's boot2)
  *	r1 - if (r0 == 0) then metadata pointer
  * On entry for Linux boot ABI:
  *	r0 - 0
  *	r1 - machine type (passed as arg2 to initarm)
  *	r2 - Pointer to a tagged list or dtb image (phys addr) (passed as arg1 initarm)
  *
  * For both types of boot we gather up the args, put them in a struct arm_boot_params
  * structure and pass that to initarm.
  */
-ENTRY_NP(btext)
+	.globl	btext
+btext:
 ASENTRY_NP(_start)
 	STOP_UNWINDING		/* Can't unwind into the bootloader! */
 
 	mov	r9, r0		/* 0 or boot mode from boot2 */
 	mov	r8, r1		/* Save Machine type */
 	mov	ip, r2		/* Save meta data */
-	mov	fp, r3		/* Future expantion */
+	mov	fp, r3		/* Future expansion */
 
 	/* Make sure interrupts are disabled. */
 	mrs	r7, cpsr
 	orr	r7, r7, #(I32_bit|F32_bit)
 	msr	cpsr_c, r7
 
 #if defined (FLASHADDR) && defined(LOADERRAMADDR)
 	/* Check if we're running from flash. */
 	ldr	r7, =FLASHADDR
 	/*
 	 * If we're running with MMU disabled, test against the
 	 * physical address instead.
 	 */
 	mrc     p15, 0, r2, c1, c0, 0
 	ands	r2, r2, #CPU_CONTROL_MMU_ENABLE
 	ldreq	r6, =PHYSADDR
 	ldrne	r6, =LOADERRAMADDR
 	cmp	r7, r6
 	bls 	flash_lower
 	cmp	r7, pc
 	bhi	from_ram
 	b	do_copy
 	
 flash_lower:
 	cmp	r6, pc
 	bls	from_ram
 do_copy:
 	ldr	r7, =KERNBASE
 	adr	r1, _start
 	ldr	r0, Lreal_start
 	ldr	r2, Lend
 	sub	r2, r2, r0
 	sub	r0, r0, r7
 	add	r0, r0, r6
 	mov	r4, r0
 	bl	memcpy
 	ldr	r0, Lram_offset
 	add	pc, r4, r0
 Lram_offset:	.word from_ram-_C_LABEL(_start)
 from_ram:
 	nop
 #endif
 	adr	r7, Lunmapped
 	bic     r7, r7, #0xf0000000
 	orr     r7, r7, #PHYSADDR
 
 
 disable_mmu:
 	/* Disable MMU for a while */
 	mrc     p15, 0, r2, c1, c0, 0
 	bic	r2, r2, #(CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE |\
 	    CPU_CONTROL_WBUF_ENABLE)
 	bic	r2, r2, #(CPU_CONTROL_IC_ENABLE)
 	bic	r2, r2, #(CPU_CONTROL_BPRD_ENABLE)
 	mcr     p15, 0, r2, c1, c0, 0
 
 	nop
 	nop
 	nop
 	mov	pc, r7
 Lunmapped:
 	/*
 	 * Build page table from scratch.
 	 */
 
 	/* Find the delta between VA and PA */
 	adr	r0, Lpagetable
 	ldr	r1, [r0]
 	sub	r2, r1, r0
 	/* At this point: r2 = VA - PA */
 
 	/*
 	 * Find the physical address of the table. After these two
 	 * instructions:
 	 * r1 = va(pagetable)
 	 *
 	 * r0 = va(pagetable) - (VA - PA)
 	 *    = va(pagetable) - VA + PA
 	 *    = pa(pagetable)
 	 */
 	ldr	r1, [r0, #4]
 	sub	r0, r1, r2
 
 	/*
 	 * Map PA == VA
 	 */
-	/* Find the start kernels load address */
+	/* Find the kernel's start (load) address */
 	adr	r5, _start
 	ldr	r2, =(L1_S_OFFSET)
 	bic	r5, r2
 	mov	r1, r5
 	mov	r2, r5
 	/* Map 64MiB, preserved over calls to build_pagetables */
 	mov	r3, #64
 	bl	build_pagetables
 
 	/* Create the kernel map to jump to */
 	mov	r1, r5
 	ldr	r2, =(KERNVIRTADDR)
 	bl	build_pagetables
 	
 #if defined(SOCDEV_PA) && defined(SOCDEV_VA)
 	/* Create the custom map */
 	ldr	r1, =SOCDEV_PA
 	ldr	r2, =SOCDEV_VA
 	bl	build_pagetables
 #endif
 
 #if defined(SMP)
 	orr 	r0, r0, #2		/* Set TTB shared memory flag */
 #endif
 	mcr	p15, 0, r0, c2, c0, 0	/* Set TTB */
 	mcr	p15, 0, r0, c8, c7, 0	/* Flush TLB */
 
 #if defined(CPU_ARM1136) || defined(CPU_ARM1176) || defined(CPU_CORTEXA) || defined(CPU_MV_PJ4B) || defined(CPU_KRAIT)
 	mov	r0, #0
 	mcr	p15, 0, r0, c13, c0, 1	/* Set ASID to 0 */
 #endif
 
 	/* Set the Domain Access register.  Very important! */
 	mov     r0, #((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT)
 	mcr	p15, 0, r0, c3, c0, 0
 	/* 
 	 * Enable MMU.
 	 * On armv6 enable extended page tables, and set alignment checking
 	 * to modulo-4 (CPU_CONTROL_UNAL_ENABLE) for the ldrd/strd
 	 * instructions emitted by clang.
 	 */
 	mrc	p15, 0, r0, c1, c0, 0
 #ifdef _ARM_ARCH_6
 	orr	r0, r0, #(CPU_CONTROL_V6_EXTPAGE | CPU_CONTROL_UNAL_ENABLE)
 	orr	r0, r0, #(CPU_CONTROL_AFLT_ENABLE)
 	orr	r0, r0, #(CPU_CONTROL_AF_ENABLE)
 #endif
 	orr	r0, r0, #(CPU_CONTROL_MMU_ENABLE)
 	mcr	p15, 0, r0, c1, c0, 0
 	nop
 	nop
 	nop
 	CPWAIT(r0)
 
 mmu_done:
 	nop
 	adr	r1, .Lstart
 	ldmia	r1, {r1, r2, sp}	/* Set initial stack and */
 	sub	r2, r2, r1		/* get zero init data */
 	mov	r3, #0
 .L1:
 	str	r3, [r1], #0x0004	/* get zero init data */
 	subs	r2, r2, #4
 	bgt	.L1
 	ldr	pc, .Lvirt_done
 
 virt_done:
 	mov	r1, #28			/* loader info size is 28 bytes also second arg */
 	subs	sp, sp, r1		/* allocate arm_boot_params struct on stack */
 	mov	r0, sp			/* loader info pointer is first arg */
 	bic	sp, sp, #7		/* align stack to 8 bytes */
 	str	r1, [r0]		/* Store length of loader info */
 	str	r9, [r0, #4]		/* Store r0 from boot loader */
 	str	r8, [r0, #8]		/* Store r1 from boot loader */
 	str	ip, [r0, #12]		/* store r2 from boot loader */
 	str	fp, [r0, #16]		/* store r3 from boot loader */
 	str	r5, [r0, #20]		/* store the physical address */
 	adr	r4, Lpagetable		/* load the pagetable address */
 	ldr	r5, [r4, #4]
 	str	r5, [r0, #24]		/* store the pagetable address */
 	mov	fp, #0			/* trace back starts here */
 	bl	_C_LABEL(initarm)	/* Off we go */
 
 	/* init arm will return the new stack pointer. */
 	mov	sp, r0
 
 	bl	_C_LABEL(mi_startup)		/* call mi_startup()! */
 
 	adr	r0, .Lmainreturned
 	b	_C_LABEL(panic)
 	/* NOTREACHED */
-END(btext)
 END(_start)
 
 /*
  * Builds the page table
  * r0 - The table base address
  * r1 - The physical address (trashed)
  * r2 - The virtual address (trashed)
  * r3 - The number of 1MiB sections
  * r4 - Trashed
  *
  * Addresses must be 1MiB aligned
  */
 build_pagetables:
-	/* Set the required page attributed */
+	/* Set the required page attributes */
 	ldr	r4, =(L1_TYPE_S|L1_S_C|L1_S_AP(AP_KRW))
 #if defined(SMP)
 	orr	r4, #(L1_SHARED)
 #endif
 	orr	r1, r4
 
 	/* Move the virtual address to the correct bit location */
 	lsr	r2, #(L1_S_SHIFT - 2)
 
 	mov	r4, r3
 1:
 	str	r1, [r0, r2]
 	add	r2, r2, #4
 	add	r1, r1, #(L1_S_SIZE)
 	adds	r4, r4, #-1
 	bhi	1b
 
 	RET
 
 Lpagetable:
 	.word	.
 	.word	pagetable
 
 Lvirtaddr:
 	.word	KERNVIRTADDR
 Lphysaddr:
 	.word	KERNPHYSADDR
 Lreal_start:
 	.word	_start
 Lend:
 	.word	_edata
 
 .Lstart:
 	.word	_edata
 	.word	_ebss
 	.word	svcstk + INIT_ARM_STACK_SIZE
 
 .Lvirt_done:
 	.word	virt_done
 
 .Lmainreturned:
 	.asciz	"main() returned"
 	.align	0
 
 	.bss
 svcstk:
 	.space	INIT_ARM_STACK_SIZE
 
 /*
  * Memory for the initial pagetable. We are unable to place this in
  * the bss as this will be cleared after the table is loaded.
  */
 	.section ".init_pagetable"
 	.align	14 /* 16KiB aligned */
 pagetable:
 	.space	L1_TABLE_SIZE
 
 	.text
 	.align	0
 
 .Lcpufuncs:
 	.word	_C_LABEL(cpufuncs)
 
 #if defined(SMP)
 
 .Lmpvirt_done:
 	.word	mpvirt_done
 Lstartup_pagetable_secondary:
 	.word	temp_pagetable
 
 ASENTRY_NP(mpentry)
 
 	/* Make sure interrupts are disabled. */
 	mrs	r7, cpsr
 	orr	r7, r7, #(I32_bit|F32_bit)
 	msr	cpsr_c, r7
 
 	/* Disable MMU.  It should be disabled already, but make sure. */
 	mrc	p15, 0, r2, c1, c0, 0
 	bic	r2, r2, #(CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE |\
 	    CPU_CONTROL_WBUF_ENABLE)
 	bic	r2, r2, #(CPU_CONTROL_IC_ENABLE)
 	bic	r2, r2, #(CPU_CONTROL_BPRD_ENABLE)
 	mcr	p15, 0, r2, c1, c0, 0
 	nop
 	nop
 	nop
 	CPWAIT(r0)
 
 #if defined(ARM_MMU_V6)
 	bl	armv6_idcache_inv_all	/* Modifies r0 only */
 #elif defined(ARM_MMU_V7)
 	bl	armv7_idcache_inv_all	/* Modifies r0-r3, ip */
 #endif
 
 	ldr	r0, Lstartup_pagetable_secondary
 	bic	r0, r0, #0xf0000000
 	orr	r0, r0, #PHYSADDR
 	ldr	r0, [r0]
 	orr 	r0, r0, #2		/* Set TTB shared memory flag */
 	mcr	p15, 0, r0, c2, c0, 0	/* Set TTB */
 	mcr	p15, 0, r0, c8, c7, 0	/* Flush TLB */
 
 	mov	r0, #0
 	mcr	p15, 0, r0, c13, c0, 1	/* Set ASID to 0 */
 
 	/* Set the Domain Access register.  Very important! */
 	mov	r0, #((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT)
 	mcr	p15, 0, r0, c3, c0, 0
 	/* Enable MMU */
 	mrc	p15, 0, r0, c1, c0, 0
 	orr	r0, r0, #CPU_CONTROL_V6_EXTPAGE
 	orr	r0, r0, #CPU_CONTROL_AF_ENABLE
 	orr	r0, r0, #(CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE |\
 	    CPU_CONTROL_WBUF_ENABLE)
 	orr	r0, r0, #(CPU_CONTROL_IC_ENABLE)
 	orr	r0, r0, #(CPU_CONTROL_BPRD_ENABLE)
 	mcr	p15, 0, r0, c1, c0, 0
 	nop
 	nop
 	nop
 	CPWAIT(r0)
 
 	adr	r1, .Lstart
 	ldmia	r1, {r1, r2, sp}	/* Set initial stack and */
 	mrc	p15, 0, r0, c0, c0, 5
 	and	r0, r0, #15
 	mov	r1, #2048
 	mul	r2, r1, r0
 	sub	sp, sp, r2
 	str	r1, [sp]
 	ldr	pc, .Lmpvirt_done
 
 mpvirt_done:
 
 	mov	fp, #0			/* trace back starts here */
 	bl	_C_LABEL(init_secondary)	/* Off we go */
 
 	adr	r0, .Lmpreturned
 	b	_C_LABEL(panic)
 	/* NOTREACHED */
 
 .Lmpreturned:
 	.asciz	"init_secondary() returned"
 	.align	0
 END(mpentry)
 #endif
 
 ENTRY_NP(cpu_halt)
 	mrs     r2, cpsr
 	bic	r2, r2, #(PSR_MODE)
 	orr     r2, r2, #(PSR_SVC32_MODE)
 	orr	r2, r2, #(I32_bit | F32_bit)
 	msr     cpsr_fsxc, r2
 
 	ldr	r4, .Lcpu_reset_address
 	ldr	r4, [r4]
 
 	ldr	r0, .Lcpufuncs
 	mov	lr, pc
 	ldr	pc, [r0, #CF_IDCACHE_WBINV_ALL]
 	mov	lr, pc
 	ldr	pc, [r0, #CF_L2CACHE_WBINV_ALL]
 
 	/*
 	 * Load the cpu_reset_needs_v4_MMU_disable flag to determine if it's
 	 * necessary.
 	 */
 
 	ldr	r1, .Lcpu_reset_needs_v4_MMU_disable
 	ldr	r1, [r1]
 	cmp	r1, #0
 	mov	r2, #0
 
 	/*
 	 * MMU & IDC off, 32 bit program & data space
 	 * Hurl ourselves into the ROM
 	 */
 	mov	r0, #(CPU_CONTROL_32BP_ENABLE | CPU_CONTROL_32BD_ENABLE)
 	mcr     15, 0, r0, c1, c0, 0
 	mcrne   15, 0, r2, c8, c7, 0 	/* nail I+D TLB on ARMv4 and greater */
 	mov     pc, r4
 
 	/*
 	 * _cpu_reset_address contains the address to branch to, to complete
 	 * the cpu reset after turning the MMU off
 	 * This variable is provided by the hardware specific code
 	 */
 .Lcpu_reset_address:
 	.word	_C_LABEL(cpu_reset_address)
 
 	/*
 	 * cpu_reset_needs_v4_MMU_disable contains a flag that signals if the
 	 * v4 MMU disable instruction needs executing... it is an illegal instruction
 	 * on f.e. ARM6/7 that locks up the computer in an endless illegal
 	 * instruction / data-abort / reset loop.
 	 */
 .Lcpu_reset_needs_v4_MMU_disable:
 	.word	_C_LABEL(cpu_reset_needs_v4_MMU_disable)
 END(cpu_halt)
 
 
 /*
- * setjump + longjmp
+ * setjmp + longjmp
  */
 ENTRY(setjmp)
 	stmia	r0, {r4-r14}
 	mov	r0, #0x00000000
 	RET
 END(setjmp)
 
 ENTRY(longjmp)
 	ldmia	r0, {r4-r14}
 	mov	r0, #0x00000001
 	RET
 END(longjmp)
 
 	.data
 	.global _C_LABEL(esym)
 _C_LABEL(esym):	.word	_C_LABEL(end)
 
 ENTRY_NP(abort)
 	b	_C_LABEL(abort)
 END(abort)
 
 ENTRY_NP(sigcode)
 	mov	r0, sp
 	add	r0, r0, #SIGF_UC
 
 	/*
 	 * Call the sigreturn system call.
 	 * 
 	 * We have to load r7 manually rather than using
 	 * "ldr r7, =SYS_sigreturn" to ensure the value of szsigcode is
 	 * correct. Using the alternative places esigcode at the address
 	 * of the data rather than the address one past the data.
 	 */
 
 	ldr	r7, [pc, #12]	/* Load SYS_sigreturn */
 	swi	SYS_sigreturn
 
 	/* Well if that failed we better exit quick ! */
 
 	ldr	r7, [pc, #8]	/* Load SYS_exit */
 	swi	SYS_exit
 
 	/* Branch back to retry SYS_sigreturn */
 	b	. - 16
-
+END(sigcode)
 	.word	SYS_sigreturn
 	.word	SYS_exit
 
 	.align	0
 	.global _C_LABEL(esigcode)
 		_C_LABEL(esigcode):
 
 	.data
 	.global szsigcode
 szsigcode:
 	.long esigcode-sigcode
-END(sigcode)
+
 /* End of locore.S */
Index: stable/10/sys/arm/arm/setstack.s
===================================================================
--- stable/10/sys/arm/arm/setstack.s	(revision 269795)
+++ stable/10/sys/arm/arm/setstack.s	(revision 269796)
@@ -1,94 +1,94 @@
 /*	$NetBSD: setstack.S,v 1.1 2001/07/28 13:28:03 chris Exp $	*/
 
 /*-
  * Copyright (c) 1994 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * setstack.S
  *
  * Miscellaneous routine to play with the stack pointer in different CPU modes
  *
  * Eventually this routine can be inline assembly.
  *
  * Created      : 17/09/94
  *
- * Based of kate/display/setstack.s
+ * Based on kate/display/setstack.s
  *
  */
 
 #include <machine/armreg.h>
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 /* To set the stack pointer for a particular mode we must switch
  * to that mode update the banked r13 and then switch back.
  * This routine provides an easy way of doing this for any mode
  *
  * r0 = CPU mode
  * r1 = stackptr
  */
 
 ENTRY(set_stackptr)
         mrs	r3, cpsr		/* Switch to the appropriate mode */
 	bic	r2, r3, #(PSR_MODE)
 	orr	r2, r2, r0
         msr	cpsr_fsxc, r2
 
 	mov	sp, r1			/* Set the stack pointer */
 
         msr	cpsr_fsxc, r3		/* Restore the old mode */
 
 	mov	pc, lr			/* Exit */
-
+END(set_stackptr)
 /* To get the stack pointer for a particular mode we must switch
  * to that mode copy the banked r13 and then switch back.
  * This routine provides an easy way of doing this for any mode
  *
  * r0 = CPU mode
  */
 
 ENTRY(get_stackptr)
         mrs	r3, cpsr		/* Switch to the appropriate mode */
 	bic	r2, r3, #(PSR_MODE)
 	orr	r2, r2, r0
         msr	cpsr_fsxc, r2
 
-	mov	r0, sp			/* Set the stack pointer */
+	mov	r0, sp			/* Get the stack pointer */
 
         msr	cpsr_fsxc, r3		/* Restore the old mode */
 
 	mov	pc, lr			/* Exit */
-
+END(get_stackptr)
 /* End of setstack.S */
Index: stable/10/sys/arm/arm/support.S
===================================================================
--- stable/10/sys/arm/arm/support.S	(revision 269795)
+++ stable/10/sys/arm/arm/support.S	(revision 269796)
@@ -1,2957 +1,2960 @@
 /*-
  * Copyright (c) 2004 Olivier Houchard
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*
  * Copyright 2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 /*
  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Neil A. Carson and Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 #include "assym.s"
 
 .L_arm_memcpy:
 	.word	_C_LABEL(_arm_memcpy)
 .L_arm_bzero:
 	.word	_C_LABEL(_arm_bzero)
 .L_min_memcpy_size:
 	.word	_C_LABEL(_min_memcpy_size)
 .L_min_bzero_size:
 	.word	_C_LABEL(_min_bzero_size)
 /*
  * memset: Sets a block of memory to the specified value
  *
  * On entry:
  *   r0 - dest address
  *   r1 - byte to write
  *   r2 - number of bytes to write
  *
  * On exit:
  *   r0 - dest address
  */
 /* LINTSTUB: Func: void bzero(void *, size_t) */
 ENTRY(bzero)
 	ldr	r3, .L_arm_bzero
 	ldr	r3, [r3]
 	cmp	r3, #0
 	beq	.Lnormal0
 	ldr	r2, .L_min_bzero_size
 	ldr	r2, [r2]
 	cmp	r1, r2
 	blt	.Lnormal0
 	stmfd	sp!, {r0, r1, lr}
 	mov	r2, #0
 	mov	lr, pc
 	mov	pc, r3
 	cmp	r0, #0
 	ldmfd	sp!, {r0, r1, lr}
 	RETeq
 .Lnormal0:
 	mov	r3, #0x00
 	b	do_memset
-
+END(bzero)
 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
 ENTRY(memset)
 	and	r3, r1, #0xff		/* We deal with bytes */
 	mov	r1, r2
 do_memset:
 	cmp	r1, #0x04		/* Do we have less than 4 bytes */
 	mov	ip, r0
 	blt	.Lmemset_lessthanfour
 
 	/* Ok first we will word align the address */
 	ands	r2, ip, #0x03		/* Get the bottom two bits */
 	bne	.Lmemset_wordunaligned	/* The address is not word aligned */
 
 	/* We are now word aligned */
 .Lmemset_wordaligned:
 	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
 #ifdef _ARM_ARCH_5E
 	tst	ip, #0x04		/* Quad-align for armv5e */
 #else
 	cmp	r1, #0x10
 #endif
 	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
 #ifdef _ARM_ARCH_5E
 	subne	r1, r1, #0x04		/* Quad-align if necessary */
 	strne	r3, [ip], #0x04
 	cmp	r1, #0x10
 #endif
 	blt	.Lmemset_loop4		/* If less than 16 then use words */
 	mov	r2, r3			/* Duplicate data */
 	cmp	r1, #0x80		/* If < 128 then skip the big loop */
 	blt	.Lmemset_loop32
 
 	/* Do 128 bytes at a time */
 .Lmemset_loop128:
 	subs	r1, r1, #0x80
 #ifdef _ARM_ARCH_5E
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 #else
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 #endif
 	bgt	.Lmemset_loop128
 	RETeq			/* Zero length so just exit */
 
 	add	r1, r1, #0x80		/* Adjust for extra sub */
 
 	/* Do 32 bytes at a time */
 .Lmemset_loop32:
 	subs	r1, r1, #0x20
 #ifdef _ARM_ARCH_5E
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 #else
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 #endif
 	bgt	.Lmemset_loop32
 	RETeq			/* Zero length so just exit */
 
 	adds	r1, r1, #0x10		/* Partially adjust for extra sub */
 
 	/* Deal with 16 bytes or more */
 #ifdef _ARM_ARCH_5E
 	strged	r2, [ip], #0x08
 	strged	r2, [ip], #0x08
 #else
 	stmgeia	ip!, {r2-r3}
 	stmgeia	ip!, {r2-r3}
 #endif
 	RETeq			/* Zero length so just exit */
 
 	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */
 
 	/* We have at least 4 bytes so copy as words */
 .Lmemset_loop4:
 	subs	r1, r1, #0x04
 	strge	r3, [ip], #0x04
 	bgt	.Lmemset_loop4
 	RETeq			/* Zero length so just exit */
 
 #ifdef _ARM_ARCH_5E
 	/* Compensate for 64-bit alignment check */
 	adds	r1, r1, #0x04
 	RETeq
 	cmp	r1, #2
 #else
 	cmp	r1, #-2
 #endif
 
 	strb	r3, [ip], #0x01		/* Set 1 byte */
 	strgeb	r3, [ip], #0x01		/* Set another byte */
 	strgtb	r3, [ip]		/* and a third */
 	RET			/* Exit */
 
 .Lmemset_wordunaligned:
 	rsb	r2, r2, #0x004
 	strb	r3, [ip], #0x01		/* Set 1 byte */
 	cmp	r2, #0x02
 	strgeb	r3, [ip], #0x01		/* Set another byte */
 	sub	r1, r1, r2
 	strgtb	r3, [ip], #0x01		/* and a third */
 	cmp	r1, #0x04		/* More than 4 bytes left? */
 	bge	.Lmemset_wordaligned	/* Yup */
 
 .Lmemset_lessthanfour:
 	cmp	r1, #0x00
 	RETeq			/* Zero length so exit */
 	strb	r3, [ip], #0x01		/* Set 1 byte */
 	cmp	r1, #0x02
 	strgeb	r3, [ip], #0x01		/* Set another byte */
 	strgtb	r3, [ip]		/* and a third */
 	RET			/* Exit */
-END(bzero)
 END(memset)
 
 ENTRY(bcmp)
 	mov	ip, r0
 	cmp	r2, #0x06
 	beq	.Lmemcmp_6bytes
 	mov	r0, #0x00
 
 	/* Are both addresses aligned the same way? */
 	cmp	r2, #0x00
 	eornes	r3, ip, r1
 	RETeq			/* len == 0, or same addresses! */
 	tst	r3, #0x03
 	subne	r2, r2, #0x01
 	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */
 
 	/* Word-align the addresses, if necessary */
 	sub	r3, r1, #0x05
 	ands	r3, r3, #0x03
 	add	r3, r3, r3, lsl #1
 	addne	pc, pc, r3, lsl #3
 	nop
 
 	/* Compare up to 3 bytes */
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r0, r0, r3
 	RETne
 	subs	r2, r2, #0x01
 	RETeq
 
 	/* Compare up to 2 bytes */
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r0, r0, r3
 	RETne
 	subs	r2, r2, #0x01
 	RETeq
 
 	/* Compare 1 byte */
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r0, r0, r3
 	RETne
 	subs	r2, r2, #0x01
 	RETeq
 
 	/* Compare 4 bytes at a time, if possible */
 	subs	r2, r2, #0x04
 	bcc	.Lmemcmp_bytewise
 .Lmemcmp_word_aligned:
 	ldr	r0, [ip], #0x04
 	ldr	r3, [r1], #0x04
 	subs	r2, r2, #0x04
 	cmpcs	r0, r3
 	beq	.Lmemcmp_word_aligned
 	sub	r0, r0, r3
 
 	/* Correct for extra subtraction, and check if done */
 	adds	r2, r2, #0x04
 	cmpeq	r0, #0x00		/* If done, did all bytes match? */
 	RETeq			/* Yup. Just return */
 
 	/* Re-do the final word byte-wise */
 	sub	ip, ip, #0x04
 	sub	r1, r1, #0x04
 
 .Lmemcmp_bytewise:
 	add	r2, r2, #0x03
 .Lmemcmp_bytewise2:
 	ldrb	r0, [ip], #0x01
 	ldrb	r3, [r1], #0x01
 	subs	r2, r2, #0x01
 	cmpcs	r0, r3
 	beq	.Lmemcmp_bytewise2
 	sub	r0, r0, r3
 	RET
 
 	/*
 	 * 6 byte compares are very common, thanks to the network stack.
 	 * This code is hand-scheduled to reduce the number of stalls for
 	 * load results. Everything else being equal, this will be ~32%
 	 * faster than a byte-wise memcmp.
 	 */
 	.align	5
 .Lmemcmp_6bytes:
 	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
 	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
 	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
 	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
 	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
 	RETne			/* Return if mismatch on #0 */
 	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
 	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
 	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
 	RETne			/* Return if mismatch on #1 */
 	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
 	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
 	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
 	RETne			/* Return if mismatch on #2 */
 	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
 	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
 	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
 	RETne			/* Return if mismatch on #3 */
 	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
 	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
 	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
 	RETne			/* Return if mismatch on #4 */
 	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
 	RET
 END(bcmp)
 
 ENTRY(bcopy)
 	/* switch the source and destination registers */
 	eor     r0, r1, r0
 	eor     r1, r0, r1
 	eor     r0, r1, r0
-ENTRY(memmove)
+EENTRY(memmove)
 	/* Do the buffers overlap? */
 	cmp	r0, r1
 	RETeq		/* Bail now if src/dst are the same */
 	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
 	subcs	r3, r1, r0	/* if (src > dsr) r3 = src - dst */
 	cmp	r3, r2		/* if (r3 < len) we have an overlap */
 	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)
 
 	/* Determine copy direction */
 	cmp	r1, r0
 	bcc	.Lmemmove_backwards
 
 	moveq	r0, #0			/* Quick abort for len=0 */
 	RETeq
 
 	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
 	subs	r2, r2, #4
 	blt	.Lmemmove_fl4		/* less than 4 bytes */
 	ands	r12, r0, #3
 	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
 	ands	r12, r1, #3
 	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */
 
 .Lmemmove_ft8:
 	/* We have aligned source and destination */
 	subs	r2, r2, #8
 	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
 	subs	r2, r2, #0x14
 	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
 	stmdb	sp!, {r4}		/* borrow r4 */
 
 	/* blat 32 bytes at a time */
 	/* XXX for really big copies perhaps we should use more registers */
 .Lmemmove_floop32:	
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	subs	r2, r2, #0x20
 	bge	.Lmemmove_floop32
 
 	cmn	r2, #0x10
 	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
 	stmgeia	r0!, {r3, r4, r12, lr}
 	subge	r2, r2, #0x10
 	ldmia	sp!, {r4}		/* return r4 */
 
 .Lmemmove_fl32:
 	adds	r2, r2, #0x14
 
 	/* blat 12 bytes at a time */
 .Lmemmove_floop12:
 	ldmgeia	r1!, {r3, r12, lr}
 	stmgeia	r0!, {r3, r12, lr}
 	subges	r2, r2, #0x0c
 	bge	.Lmemmove_floop12
 
 .Lmemmove_fl12:
 	adds	r2, r2, #8
 	blt	.Lmemmove_fl4
 
 	subs	r2, r2, #4
 	ldrlt	r3, [r1], #4
 	strlt	r3, [r0], #4
 	ldmgeia	r1!, {r3, r12}
 	stmgeia	r0!, {r3, r12}
 	subge	r2, r2, #4
 
 .Lmemmove_fl4:
 	/* less than 4 bytes to go */
 	adds	r2, r2, #4
 	ldmeqia	sp!, {r0, pc}		/* done */
 
 	/* copy the crud byte at a time */
 	cmp	r2, #2
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrgeb	r3, [r1], #1
 	strgeb	r3, [r0], #1
 	ldrgtb	r3, [r1], #1
 	strgtb	r3, [r0], #1
 	ldmia	sp!, {r0, pc}
 
 	/* erg - unaligned destination */
 .Lmemmove_fdestul:
 	rsb	r12, r12, #4
 	cmp	r12, #2
 
 	/* align destination with byte copies */
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrgeb	r3, [r1], #1
 	strgeb	r3, [r0], #1
 	ldrgtb	r3, [r1], #1
 	strgtb	r3, [r0], #1
 	subs	r2, r2, r12
 	blt	.Lmemmove_fl4		/* less the 4 bytes */
 
 	ands	r12, r1, #3
 	beq	.Lmemmove_ft8		/* we have an aligned source */
 
 	/* erg - unaligned source */
 	/* This is where it gets nasty ... */
 .Lmemmove_fsrcul:
 	bic	r1, r1, #3
 	ldr	lr, [r1], #4
 	cmp	r12, #2
 	bgt	.Lmemmove_fsrcul3
 	beq	.Lmemmove_fsrcul2
 	cmp	r2, #0x0c
 	blt	.Lmemmove_fsrcul1loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemmove_fsrcul1loop16:
 #ifdef __ARMEB__
 	mov	r3, lr, lsl #8
 #else
 	mov	r3, lr, lsr #8
 #endif
 	ldmia	r1!, {r4, r5, r12, lr}
 #ifdef __ARMEB__
 	orr	r3, r3, r4, lsr #24
 	mov	r4, r4, lsl #8
 	orr	r4, r4, r5, lsr #24
 	mov	r5, r5, lsl #8
 	orr	r5, r5, r12, lsr #24
 	mov	r12, r12, lsl #8
 	orr	r12, r12, lr, lsr #24
 #else
 	orr	r3, r3, r4, lsl #24
 	mov	r4, r4, lsr #8
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r12, lsl #24
 	mov	r12, r12, lsr #8
 	orr	r12, r12, lr, lsl #24
 #endif
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_fsrcul1loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_fsrcul1l4
 
 .Lmemmove_fsrcul1loop4:
 #ifdef __ARMEB__
 	mov	r12, lr, lsl #8
 #else
 	mov	r12, lr, lsr #8
 #endif
 	ldr	lr, [r1], #4
 #ifdef __ARMEB__
 	orr	r12, r12, lr, lsr #24
 #else
 	orr	r12, r12, lr, lsl #24
 #endif
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemmove_fsrcul1loop4
 
 .Lmemmove_fsrcul1l4:
 	sub	r1, r1, #3
 	b	.Lmemmove_fl4
 
 .Lmemmove_fsrcul2:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_fsrcul2loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemmove_fsrcul2loop16:
 #ifdef __ARMEB__
 	mov	r3, lr, lsl #16
 #else
 	mov	r3, lr, lsr #16
 #endif
 	ldmia	r1!, {r4, r5, r12, lr}
 #ifdef __ARMEB__
 	orr	r3, r3, r4, lsr #16
 	mov	r4, r4, lsl #16
 	orr	r4, r4, r5, lsr #16
 	mov	r5, r5, lsl #16
 	orr	r5, r5, r12, lsr #16
 	mov	r12, r12, lsl #16
 	orr	r12, r12, lr, lsr #16
 #else
 	orr	r3, r3, r4, lsl #16
 	mov	r4, r4, lsr #16
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r12, lsl #16
 	mov	r12, r12, lsr #16
 	orr	r12, r12, lr, lsl #16
 #endif
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_fsrcul2loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_fsrcul2l4
 
 .Lmemmove_fsrcul2loop4:
 #ifdef __ARMEB__
 	mov	r12, lr, lsl #16
 #else
 	mov	r12, lr, lsr #16
 #endif
 	ldr	lr, [r1], #4
 #ifdef __ARMEB__
 	orr	r12, r12, lr, lsr #16
 #else
 	orr	r12, r12, lr, lsl #16
 #endif
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemmove_fsrcul2loop4
 
 .Lmemmove_fsrcul2l4:
 	sub	r1, r1, #2
 	b	.Lmemmove_fl4
 
 .Lmemmove_fsrcul3:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_fsrcul3loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemmove_fsrcul3loop16:
 #ifdef __ARMEB__
 	mov	r3, lr, lsl #24
 #else
 	mov	r3, lr, lsr #24
 #endif
 	ldmia	r1!, {r4, r5, r12, lr}
 #ifdef __ARMEB__
 	orr	r3, r3, r4, lsr #8
 	mov	r4, r4, lsl #24
 	orr	r4, r4, r5, lsr #8
 	mov	r5, r5, lsl #24
 	orr	r5, r5, r12, lsr #8
 	mov	r12, r12, lsl #24
 	orr	r12, r12, lr, lsr #8
 #else
 	orr	r3, r3, r4, lsl #8
 	mov	r4, r4, lsr #24
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r12, lsl #8
 	mov	r12, r12, lsr #24
 	orr	r12, r12, lr, lsl #8
 #endif
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_fsrcul3loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_fsrcul3l4
 
 .Lmemmove_fsrcul3loop4:
 #ifdef __ARMEB__
 	mov	r12, lr, lsl #24
 #else
 	mov	r12, lr, lsr #24
 #endif
 	ldr	lr, [r1], #4
 #ifdef __ARMEB__
 	orr	r12, r12, lr, lsr #8
 #else
 	orr	r12, r12, lr, lsl #8
 #endif
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemmove_fsrcul3loop4
 
 .Lmemmove_fsrcul3l4:
 	sub	r1, r1, #1
 	b	.Lmemmove_fl4
 
 .Lmemmove_backwards:
 	add	r1, r1, r2
 	add	r0, r0, r2
 	subs	r2, r2, #4
 	blt	.Lmemmove_bl4		/* less than 4 bytes */
 	ands	r12, r0, #3
 	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
 	ands	r12, r1, #3
 	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */
 
 .Lmemmove_bt8:
 	/* We have aligned source and destination */
 	subs	r2, r2, #8
 	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
 	stmdb	sp!, {r4, lr}
 	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
 	blt	.Lmemmove_bl32
 
 	/* blat 32 bytes at a time */
 	/* XXX for really big copies perhaps we should use more registers */
 .Lmemmove_bloop32:
 	ldmdb	r1!, {r3, r4, r12, lr}
 	stmdb	r0!, {r3, r4, r12, lr}
 	ldmdb	r1!, {r3, r4, r12, lr}
 	stmdb	r0!, {r3, r4, r12, lr}
 	subs	r2, r2, #0x20
 	bge	.Lmemmove_bloop32
 
 .Lmemmove_bl32:
 	cmn	r2, #0x10
 	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
 	stmgedb	r0!, {r3, r4, r12, lr}
 	subge	r2, r2, #0x10
 	adds	r2, r2, #0x14
 	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
 	stmgedb	r0!, {r3, r12, lr}
 	subge	r2, r2, #0x0c
 	ldmia	sp!, {r4, lr}
 
 .Lmemmove_bl12:
 	adds	r2, r2, #8
 	blt	.Lmemmove_bl4
 	subs	r2, r2, #4
 	ldrlt	r3, [r1, #-4]!
 	strlt	r3, [r0, #-4]!
 	ldmgedb	r1!, {r3, r12}
 	stmgedb	r0!, {r3, r12}
 	subge	r2, r2, #4
 
 .Lmemmove_bl4:
 	/* less than 4 bytes to go */
 	adds	r2, r2, #4
 	RETeq			/* done */
 
 	/* copy the crud byte at a time */
 	cmp	r2, #2
 	ldrb	r3, [r1, #-1]!
 	strb	r3, [r0, #-1]!
 	ldrgeb	r3, [r1, #-1]!
 	strgeb	r3, [r0, #-1]!
 	ldrgtb	r3, [r1, #-1]!
 	strgtb	r3, [r0, #-1]!
 	RET
 
 	/* erg - unaligned destination */
 .Lmemmove_bdestul:
 	cmp	r12, #2
 
 	/* align destination with byte copies */
 	ldrb	r3, [r1, #-1]!
 	strb	r3, [r0, #-1]!
 	ldrgeb	r3, [r1, #-1]!
 	strgeb	r3, [r0, #-1]!
 	ldrgtb	r3, [r1, #-1]!
 	strgtb	r3, [r0, #-1]!
 	subs	r2, r2, r12
 	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
 	ands	r12, r1, #3
 	beq	.Lmemmove_bt8		/* we have an aligned source */
 
 	/* erg - unaligned source */
 	/* This is where it gets nasty ... */
 .Lmemmove_bsrcul:
 	bic	r1, r1, #3
 	ldr	r3, [r1, #0]
 	cmp	r12, #2
 	blt	.Lmemmove_bsrcul1
 	beq	.Lmemmove_bsrcul2
 	cmp	r2, #0x0c
 	blt	.Lmemmove_bsrcul3loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5, lr}
 
 .Lmemmove_bsrcul3loop16:
 #ifdef __ARMEB__
 	mov	lr, r3, lsr #8
 #else
 	mov	lr, r3, lsl #8
 #endif
 	ldmdb	r1!, {r3-r5, r12}
 #ifdef __ARMEB__
 	orr	lr, lr, r12, lsl #24
 	mov	r12, r12, lsr #8
 	orr	r12, r12, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r4, lsl #24
 	mov	r4, r4, lsr #8
 	orr	r4, r4, r3, lsl #24
 #else
 	orr	lr, lr, r12, lsr #24
 	mov	r12, r12, lsl #8
 	orr	r12, r12, r5, lsr #24
 	mov	r5, r5, lsl #8
 	orr	r5, r5, r4, lsr #24
 	mov	r4, r4, lsl #8
 	orr	r4, r4, r3, lsr #24
 #endif
 	stmdb	r0!, {r4, r5, r12, lr}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_bsrcul3loop16
 	ldmia	sp!, {r4, r5, lr}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_bsrcul3l4
 
 .Lmemmove_bsrcul3loop4:
 #ifdef __ARMEB__
 	mov	r12, r3, lsr #8
 #else
 	mov	r12, r3, lsl #8
 #endif
 	ldr	r3, [r1, #-4]!
 #ifdef __ARMEB__
 	orr	r12, r12, r3, lsl #24
 #else
 	orr	r12, r12, r3, lsr #24
 #endif
 	str	r12, [r0, #-4]!
 	subs	r2, r2, #4
 	bge	.Lmemmove_bsrcul3loop4
 
 .Lmemmove_bsrcul3l4:
 	add	r1, r1, #3
 	b	.Lmemmove_bl4
 
 .Lmemmove_bsrcul2:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_bsrcul2loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5, lr}
 
 .Lmemmove_bsrcul2loop16:
 #ifdef __ARMEB__
 	mov	lr, r3, lsr #16
 #else
 	mov	lr, r3, lsl #16
 #endif
 	ldmdb	r1!, {r3-r5, r12}
 #ifdef __ARMEB__
 	orr	lr, lr, r12, lsl #16
 	mov	r12, r12, lsr #16
 	orr	r12, r12, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r4, lsl #16
 	mov	r4, r4, lsr #16
 	orr	r4, r4, r3, lsl #16
 #else
 	orr	lr, lr, r12, lsr #16
 	mov	r12, r12, lsl #16
 	orr	r12, r12, r5, lsr #16
 	mov	r5, r5, lsl #16
 	orr	r5, r5, r4, lsr #16
 	mov	r4, r4, lsl #16
 	orr	r4, r4, r3, lsr #16
 #endif
 	stmdb	r0!, {r4, r5, r12, lr}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_bsrcul2loop16
 	ldmia	sp!, {r4, r5, lr}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_bsrcul2l4
 
 .Lmemmove_bsrcul2loop4:
 #ifdef __ARMEB__
 	mov	r12, r3, lsr #16
 #else
 	mov	r12, r3, lsl #16
 #endif
 	ldr	r3, [r1, #-4]!
 #ifdef __ARMEB__
 	orr	r12, r12, r3, lsl #16
 #else
 	orr	r12, r12, r3, lsr #16
 #endif
 	str	r12, [r0, #-4]!
 	subs	r2, r2, #4
 	bge	.Lmemmove_bsrcul2loop4
 
 .Lmemmove_bsrcul2l4:
 	add	r1, r1, #2
 	b	.Lmemmove_bl4
 
 .Lmemmove_bsrcul1:
 	cmp	r2, #0x0c
 	blt	.Lmemmove_bsrcul1loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5, lr}
 
 .Lmemmove_bsrcul1loop32:
 #ifdef __ARMEB__
 	mov	lr, r3, lsr #24
 #else
 	mov	lr, r3, lsl #24
 #endif
 	ldmdb	r1!, {r3-r5, r12}
 #ifdef __ARMEB__
 	orr	lr, lr, r12, lsl #8
 	mov	r12, r12, lsr #24
 	orr	r12, r12, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r4, lsl #8
 	mov	r4, r4, lsr #24
 	orr	r4, r4, r3, lsl #8
 #else
 	orr	lr, lr, r12, lsr #8
 	mov	r12, r12, lsl #24
 	orr	r12, r12, r5, lsr #8
 	mov	r5, r5, lsl #24
 	orr	r5, r5, r4, lsr #8
 	mov	r4, r4, lsl #24
 	orr	r4, r4, r3, lsr #8
 #endif
 	stmdb	r0!, {r4, r5, r12, lr}
 	subs	r2, r2, #0x10
 	bge	.Lmemmove_bsrcul1loop32
 	ldmia	sp!, {r4, r5, lr}
 	adds	r2, r2, #0x0c
 	blt	.Lmemmove_bsrcul1l4
 
 .Lmemmove_bsrcul1loop4:
 #ifdef __ARMEB__
 	mov	r12, r3, lsr #24
 #else
 	mov	r12, r3, lsl #24
 #endif
 	ldr	r3, [r1, #-4]!
 #ifdef __ARMEB__
 	orr	r12, r12, r3, lsl #8
 #else
 	orr	r12, r12, r3, lsr #8
 #endif
 	str	r12, [r0, #-4]!
 	subs	r2, r2, #4
 	bge	.Lmemmove_bsrcul1loop4
 
 .Lmemmove_bsrcul1l4:
 	add	r1, r1, #1
 	b	.Lmemmove_bl4
+EEND(memmove)
 END(bcopy)
-END(memmove)
 
 #if !defined(_ARM_ARCH_5E)
 ENTRY(memcpy)
 	/* save leaf functions having to store this away */
 	/* Do not check arm_memcpy if we're running from flash */
 #if defined(FLASHADDR) && defined(PHYSADDR)
 #if FLASHADDR > PHYSADDR
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bls	.Lnormal
 #else
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bhi	.Lnormal
 #endif
 #endif
 	ldr	r3, .L_arm_memcpy
 	ldr	r3, [r3]
 	cmp	r3, #0
 	beq	.Lnormal
 	ldr	r3, .L_min_memcpy_size
 	ldr	r3, [r3]
 	cmp	r2, r3
 	blt	.Lnormal
 	stmfd	sp!, {r0-r2, r4, lr}
 	mov	r3, #0
 	ldr	r4, .L_arm_memcpy
 	mov	lr, pc
 	ldr	pc, [r4]
 	cmp	r0, #0
 	ldmfd	sp!, {r0-r2, r4, lr}
 	RETeq
 
 .Lnormal:
 	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
 
 	subs	r2, r2, #4
 	blt	.Lmemcpy_l4		/* less than 4 bytes */
 	ands	r12, r0, #3
 	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
 	ands	r12, r1, #3
 	bne	.Lmemcpy_srcul		/* oh unaligned source addr */
 
 .Lmemcpy_t8:
 	/* We have aligned source and destination */
 	subs	r2, r2, #8
 	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
 	subs	r2, r2, #0x14
 	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
 	stmdb	sp!, {r4}		/* borrow r4 */
 
 	/* blat 32 bytes at a time */
 	/* XXX for really big copies perhaps we should use more registers */
 .Lmemcpy_loop32:	
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	ldmia	r1!, {r3, r4, r12, lr}
 	stmia	r0!, {r3, r4, r12, lr}
 	subs	r2, r2, #0x20
 	bge	.Lmemcpy_loop32
 
 	cmn	r2, #0x10
 	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
 	stmgeia	r0!, {r3, r4, r12, lr}
 	subge	r2, r2, #0x10
 	ldmia	sp!, {r4}		/* return r4 */
 
 .Lmemcpy_l32:
 	adds	r2, r2, #0x14
 
 	/* blat 12 bytes at a time */
 .Lmemcpy_loop12:
 	ldmgeia	r1!, {r3, r12, lr}
 	stmgeia	r0!, {r3, r12, lr}
 	subges	r2, r2, #0x0c
 	bge	.Lmemcpy_loop12
 
 .Lmemcpy_l12:
 	adds	r2, r2, #8
 	blt	.Lmemcpy_l4
 
 	subs	r2, r2, #4
 	ldrlt	r3, [r1], #4
 	strlt	r3, [r0], #4
 	ldmgeia	r1!, {r3, r12}
 	stmgeia	r0!, {r3, r12}
 	subge	r2, r2, #4
 
 .Lmemcpy_l4:
 	/* less than 4 bytes to go */
 	adds	r2, r2, #4
 #ifdef __APCS_26_
 	ldmeqia sp!, {r0, pc}^		/* done */
 #else
 	ldmeqia	sp!, {r0, pc}		/* done */
 #endif
 	/* copy the crud byte at a time */
 	cmp	r2, #2
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrgeb	r3, [r1], #1
 	strgeb	r3, [r0], #1
 	ldrgtb	r3, [r1], #1
 	strgtb	r3, [r0], #1
 	ldmia	sp!, {r0, pc}
 
 	/* erg - unaligned destination */
 .Lmemcpy_destul:
 	rsb	r12, r12, #4
 	cmp	r12, #2
 
 	/* align destination with byte copies */
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	ldrgeb	r3, [r1], #1
 	strgeb	r3, [r0], #1
 	ldrgtb	r3, [r1], #1
 	strgtb	r3, [r0], #1
 	subs	r2, r2, r12
 	blt	.Lmemcpy_l4		/* less the 4 bytes */
 
 	ands	r12, r1, #3
 	beq	.Lmemcpy_t8		/* we have an aligned source */
 
 	/* erg - unaligned source */
 	/* This is where it gets nasty ... */
 .Lmemcpy_srcul:
 	bic	r1, r1, #3
 	ldr	lr, [r1], #4
 	cmp	r12, #2
 	bgt	.Lmemcpy_srcul3
 	beq	.Lmemcpy_srcul2
 	cmp	r2, #0x0c
 	blt	.Lmemcpy_srcul1loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemcpy_srcul1loop16:
 	mov	r3, lr, lsr #8
 	ldmia	r1!, {r4, r5, r12, lr}
 	orr	r3, r3, r4, lsl #24
 	mov	r4, r4, lsr #8
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r12, lsl #24
 	mov	r12, r12, lsr #8
 	orr	r12, r12, lr, lsl #24
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_srcul1loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemcpy_srcul1l4
 
 .Lmemcpy_srcul1loop4:
 	mov	r12, lr, lsr #8
 	ldr	lr, [r1], #4
 	orr	r12, r12, lr, lsl #24
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemcpy_srcul1loop4
 
 .Lmemcpy_srcul1l4:
 	sub	r1, r1, #3
 	b	.Lmemcpy_l4
 
 .Lmemcpy_srcul2:
 	cmp	r2, #0x0c
 	blt	.Lmemcpy_srcul2loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemcpy_srcul2loop16:
 	mov	r3, lr, lsr #16
 	ldmia	r1!, {r4, r5, r12, lr}
 	orr	r3, r3, r4, lsl #16
 	mov	r4, r4, lsr #16
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r12, lsl #16
 	mov	r12, r12, lsr #16
 	orr	r12, r12, lr, lsl #16
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_srcul2loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemcpy_srcul2l4
 
 .Lmemcpy_srcul2loop4:
 	mov	r12, lr, lsr #16
 	ldr	lr, [r1], #4
 	orr	r12, r12, lr, lsl #16
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemcpy_srcul2loop4
 
 .Lmemcpy_srcul2l4:
 	sub	r1, r1, #2
 	b	.Lmemcpy_l4
 
 .Lmemcpy_srcul3:
 	cmp	r2, #0x0c
 	blt	.Lmemcpy_srcul3loop4
 	sub	r2, r2, #0x0c
 	stmdb	sp!, {r4, r5}
 
 .Lmemcpy_srcul3loop16:
 	mov	r3, lr, lsr #24
 	ldmia	r1!, {r4, r5, r12, lr}
 	orr	r3, r3, r4, lsl #8
 	mov	r4, r4, lsr #24
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r12, lsl #8
 	mov	r12, r12, lsr #24
 	orr	r12, r12, lr, lsl #8
 	stmia	r0!, {r3-r5, r12}
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_srcul3loop16
 	ldmia	sp!, {r4, r5}
 	adds	r2, r2, #0x0c
 	blt	.Lmemcpy_srcul3l4
 
 .Lmemcpy_srcul3loop4:
 	mov	r12, lr, lsr #24
 	ldr	lr, [r1], #4
 	orr	r12, r12, lr, lsl #8
 	str	r12, [r0], #4
 	subs	r2, r2, #4
 	bge	.Lmemcpy_srcul3loop4
 
 .Lmemcpy_srcul3l4:
 	sub	r1, r1, #1
 	b	.Lmemcpy_l4
 END(memcpy)
 
 #else
 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
 ENTRY(memcpy)
 	pld	[r1]
 	cmp	r2, #0x0c
 	ble	.Lmemcpy_short		/* <= 12 bytes */
 #ifdef FLASHADDR
 #if FLASHADDR > PHYSADDR
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bls	.Lnormal
 #else
 	ldr	r3, =FLASHADDR
 	cmp	r3, pc
 	bhi	.Lnormal
 #endif
 #endif
 	ldr	r3, .L_arm_memcpy
 	ldr	r3, [r3]
 	cmp	r3, #0
 	beq	.Lnormal
 	ldr	r3, .L_min_memcpy_size
 	ldr	r3, [r3]
 	cmp	r2, r3
 	blt	.Lnormal
 	stmfd	sp!, {r0-r2, r4, lr}
 	mov	r3, #0
 	ldr	r4, .L_arm_memcpy
 	mov	lr, pc
 	ldr	pc, [r4]
 	cmp	r0, #0
 	ldmfd	sp!, {r0-r2, r4, lr}
 	RETeq
 .Lnormal:
 	mov	r3, r0			/* We must not clobber r0 */
 
 	/* Word-align the destination buffer */
 	ands	ip, r3, #0x03		/* Already word aligned? */
 	beq	.Lmemcpy_wordaligned	/* Yup */
 	cmp	ip, #0x02
 	ldrb	ip, [r1], #0x01
 	sub	r2, r2, #0x01
 	strb	ip, [r3], #0x01
 	ldrleb	ip, [r1], #0x01
 	suble	r2, r2, #0x01
 	strleb	ip, [r3], #0x01
 	ldrltb	ip, [r1], #0x01
 	sublt	r2, r2, #0x01
 	strltb	ip, [r3], #0x01
 
 	/*
 	 * Destination buffer is now word aligned.
 	 *
 	 * Register use in this path, as read from the code below:
 	 *   r1 = source cursor, r3 = destination cursor, r2 = bytes left,
 	 *   ip = scratch.  r4-r9 are pushed before the bulk copy and are
 	 *   popped again on every return path of this block.
 	 */
 .Lmemcpy_wordaligned:
 	ands	ip, r1, #0x03		/* Is src also word-aligned? */
 	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
 
 	/*
 	 * Quad-align (8-byte align) the destination buffer by copying one
 	 * word if needed.  The stmfd in the middle does not change the flags,
 	 * so all three ne-conditional instructions act on the tst result.
 	 */
 	tst	r3, #0x07		/* Already quad aligned? */
 	ldrne	ip, [r1], #0x04
 	stmfd	sp!, {r4-r9}		/* Free up some registers */
 	subne	r2, r2, #0x04
 	strne	ip, [r3], #0x04
 
 	/* Destination buffer quad aligned, source is at least word aligned */
 	subs	r2, r2, #0x80
 	blt	.Lmemcpy_w_lessthan128
 
 	/*
 	 * Copy 128 bytes at a time.
 	 * Loads are issued ahead of the stores that consume them; each strd
 	 * writes a register pair (r4/r5, r6/r7 or r8/r9).  r2 is kept biased
 	 * by -0x80 while looping.  The closing subs is placed before the last
 	 * strd, which leaves the flags alone, so bge tests the subs result.
 	 */
 .Lmemcpy_w_loop128:
 	ldr	r4, [r1], #0x04		/* LD:00-03 */
 	ldr	r5, [r1], #0x04		/* LD:04-07 */
 	pld	[r1, #0x18]		/* Prefetch 0x20 */
 	ldr	r6, [r1], #0x04		/* LD:08-0b */
 	ldr	r7, [r1], #0x04		/* LD:0c-0f */
 	ldr	r8, [r1], #0x04		/* LD:10-13 */
 	ldr	r9, [r1], #0x04		/* LD:14-17 */
 	strd	r4, [r3], #0x08		/* ST:00-07 */
 	ldr	r4, [r1], #0x04		/* LD:18-1b */
 	ldr	r5, [r1], #0x04		/* LD:1c-1f */
 	strd	r6, [r3], #0x08		/* ST:08-0f */
 	ldr	r6, [r1], #0x04		/* LD:20-23 */
 	ldr	r7, [r1], #0x04		/* LD:24-27 */
 	pld	[r1, #0x18]		/* Prefetch 0x40 */
 	strd	r8, [r3], #0x08		/* ST:10-17 */
 	ldr	r8, [r1], #0x04		/* LD:28-2b */
 	ldr	r9, [r1], #0x04		/* LD:2c-2f */
 	strd	r4, [r3], #0x08		/* ST:18-1f */
 	ldr	r4, [r1], #0x04		/* LD:30-33 */
 	ldr	r5, [r1], #0x04		/* LD:34-37 */
 	strd	r6, [r3], #0x08		/* ST:20-27 */
 	ldr	r6, [r1], #0x04		/* LD:38-3b */
 	ldr	r7, [r1], #0x04		/* LD:3c-3f */
 	strd	r8, [r3], #0x08		/* ST:28-2f */
 	ldr	r8, [r1], #0x04		/* LD:40-43 */
 	ldr	r9, [r1], #0x04		/* LD:44-47 */
 	pld	[r1, #0x18]		/* Prefetch 0x60 */
 	strd	r4, [r3], #0x08		/* ST:30-37 */
 	ldr	r4, [r1], #0x04		/* LD:48-4b */
 	ldr	r5, [r1], #0x04		/* LD:4c-4f */
 	strd	r6, [r3], #0x08		/* ST:38-3f */
 	ldr	r6, [r1], #0x04		/* LD:50-53 */
 	ldr	r7, [r1], #0x04		/* LD:54-57 */
 	strd	r8, [r3], #0x08		/* ST:40-47 */
 	ldr	r8, [r1], #0x04		/* LD:58-5b */
 	ldr	r9, [r1], #0x04		/* LD:5c-5f */
 	strd	r4, [r3], #0x08		/* ST:48-4f */
 	ldr	r4, [r1], #0x04		/* LD:60-63 */
 	ldr	r5, [r1], #0x04		/* LD:64-67 */
 	pld	[r1, #0x18]		/* Prefetch 0x80 */
 	strd	r6, [r3], #0x08		/* ST:50-57 */
 	ldr	r6, [r1], #0x04		/* LD:68-6b */
 	ldr	r7, [r1], #0x04		/* LD:6c-6f */
 	strd	r8, [r3], #0x08		/* ST:58-5f */
 	ldr	r8, [r1], #0x04		/* LD:70-73 */
 	ldr	r9, [r1], #0x04		/* LD:74-77 */
 	strd	r4, [r3], #0x08		/* ST:60-67 */
 	ldr	r4, [r1], #0x04		/* LD:78-7b */
 	ldr	r5, [r1], #0x04		/* LD:7c-7f */
 	strd	r6, [r3], #0x08		/* ST:68-6f */
 	strd	r8, [r3], #0x08		/* ST:70-77 */
 	subs	r2, r2, #0x80
 	strd	r4, [r3], #0x08		/* ST:78-7f */
 	bge	.Lmemcpy_w_loop128
 
 .Lmemcpy_w_lessthan128:
 	adds	r2, r2, #0x80		/* Adjust for extra sub */
 	ldmeqfd	sp!, {r4-r9}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x20
 	blt	.Lmemcpy_w_lessthan32
 
 	/* Copy 32 bytes at a time (r2 is biased by -0x20 while looping) */
 .Lmemcpy_w_loop32:
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x18]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	r8, [r1], #0x04
 	ldr	r9, [r1], #0x04
 	strd	r4, [r3], #0x08
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	strd	r6, [r3], #0x08
 	strd	r8, [r3], #0x08
 	subs	r2, r2, #0x20
 	strd	r4, [r3], #0x08
 	bge	.Lmemcpy_w_loop32
 
 .Lmemcpy_w_lessthan32:
 	adds	r2, r2, #0x20		/* Adjust for extra sub */
 	ldmeqfd	sp!, {r4-r9}
 	RETeq			/* Return now if done */
 
 	/*
 	 * 1..31 bytes remain.  Jump into the run of 8-byte copy stanzas below.
 	 * r4 = 0x18 - (r2 & 0x18).  Each stanza is four instructions (16
 	 * bytes), so adding r4 * 2 to pc skips one stanza for every 8-byte
 	 * unit that is not needed.  pc reads as this instruction + 8, i.e. the
 	 * first stanza; the nop fills that gap for the fall-through case
 	 * (r4 == 0, all three stanzas run).
 	 */
 	and	r4, r2, #0x18
 	rsbs	r4, r4, #0x18
 	addne	pc, pc, r4, lsl #1
 	nop
 
 	/* At least 24 bytes remaining */
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	sub	r2, r2, #0x08
 	strd	r4, [r3], #0x08
 
 	/* At least 16 bytes remaining */
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	sub	r2, r2, #0x08
 	strd	r4, [r3], #0x08
 
 	/* At least 8 bytes remaining */
 	ldr	r4, [r1], #0x04
 	ldr	r5, [r1], #0x04
 	subs	r2, r2, #0x08
 	strd	r4, [r3], #0x08
 
 	/*
 	 * Less than 8 bytes remaining.
 	 * Z comes from the subs above, or from the rsbs when the computed
 	 * jump skipped every stanza (r4 == 0x18 there, so Z is clear).
 	 * ldmfd leaves the flags untouched.
 	 */
 	ldmfd	sp!, {r4-r9}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04		/* Room for one more whole word? */
 	ldrge	ip, [r1], #0x04
 	strge	ip, [r3], #0x04
 	RETeq			/* Return now if done */
 	addlt	r2, r2, #0x04		/* No word copied: undo the subs */
 	/* 1 to 3 bytes left: the 2nd is copied if r2 >= 2, the 3rd if r2 > 2 */
 	ldrb	ip, [r1], #0x01
 	cmp	r2, #0x02
 	ldrgeb	r2, [r1], #0x01
 	strb	ip, [r3], #0x01
 	ldrgtb	ip, [r1]
 	strgeb	r2, [r3], #0x01
 	strgtb	ip, [r3]
 	RET
 
 
 /*
  * At this point, it has not been possible to word align both buffers.
  * The destination buffer is word aligned, but the source buffer is not.
  *
  * On entry ip holds (src & 3), left there by the ands at
  * .Lmemcpy_wordaligned.  r1 is rounded down to a word boundary and the
  * first source word is preloaded into ip; the ldr does not change the
  * flags, so the branches below still act on the cmp.  One of three
  * handlers is then chosen by the source offset (1, 2 or 3).
  *
  * Each handler builds every destination word from two neighbouring
  * source words with a shift/orr pair; ip always carries the most
  * recently loaded source word into the next step.  r4-r7 are saved here
  * and restored on every return path.
  */
 .Lmemcpy_bad_align:
 	stmfd	sp!, {r4-r7}
 	bic	r1, r1, #0x03
 	cmp	ip, #2
 	ldr	ip, [r1], #0x04
 	bgt	.Lmemcpy_bad3
 	beq	.Lmemcpy_bad2
 	b	.Lmemcpy_bad1
 
 	/* Source offset 1: 16 bytes per iteration */
 .Lmemcpy_bad1_loop16:
 #ifdef __ARMEB__
 	mov	r4, ip, lsl #8
 #else
 	mov	r4, ip, lsr #8
 #endif
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x018]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	ip, [r1], #0x04
 #ifdef __ARMEB__
 	orr	r4, r4, r5, lsr #24
 	mov	r5, r5, lsl #8
 	orr	r5, r5, r6, lsr #24
 	mov	r6, r6, lsl #8
 	orr	r6, r6, r7, lsr #24
 	mov	r7, r7, lsl #8
 	orr	r7, r7, ip, lsr #24
 #else
 	orr	r4, r4, r5, lsl #24
 	mov	r5, r5, lsr #8
 	orr	r5, r5, r6, lsl #24
 	mov	r6, r6, lsr #8
 	orr	r6, r6, r7, lsl #24
 	mov	r7, r7, lsr #8
 	orr	r7, r7, ip, lsl #24
 #endif
 	str	r4, [r3], #0x04
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
 .Lmemcpy_bad1:
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_bad1_loop16
 
 	adds	r2, r2, #0x10
 	ldmeqfd	sp!, {r4-r7}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04
 	/*
 	 * r1 has already moved past the word held in ip.  Three bytes of
 	 * that word are still uncopied, so step back by 3 before the
 	 * bytewise tail.
 	 */
 	sublt	r1, r1, #0x03
 	blt	.Lmemcpy_bad_done
 
 	/* Source offset 1: one word per iteration */
 .Lmemcpy_bad1_loop4:
 #ifdef __ARMEB__
 	mov	r4, ip, lsl #8
 #else
 	mov	r4, ip, lsr #8
 #endif
 	ldr	ip, [r1], #0x04
 	subs	r2, r2, #0x04
 #ifdef __ARMEB__
 	orr	r4, r4, ip, lsr #24
 #else
 	orr	r4, r4, ip, lsl #24
 #endif
 	str	r4, [r3], #0x04
 	bge	.Lmemcpy_bad1_loop4
 	sub	r1, r1, #0x03
 	b	.Lmemcpy_bad_done
 
 	/* Source offset 2: 16 bytes per iteration */
 .Lmemcpy_bad2_loop16:
 #ifdef __ARMEB__
 	mov	r4, ip, lsl #16
 #else
 	mov	r4, ip, lsr #16
 #endif
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x018]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	ip, [r1], #0x04
 #ifdef __ARMEB__
 	orr	r4, r4, r5, lsr #16
 	mov	r5, r5, lsl #16
 	orr	r5, r5, r6, lsr #16
 	mov	r6, r6, lsl #16
 	orr	r6, r6, r7, lsr #16
 	mov	r7, r7, lsl #16
 	orr	r7, r7, ip, lsr #16
 #else
 	orr	r4, r4, r5, lsl #16
 	mov	r5, r5, lsr #16
 	orr	r5, r5, r6, lsl #16
 	mov	r6, r6, lsr #16
 	orr	r6, r6, r7, lsl #16
 	mov	r7, r7, lsr #16
 	orr	r7, r7, ip, lsl #16
 #endif
 	str	r4, [r3], #0x04
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
 .Lmemcpy_bad2:
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_bad2_loop16
 
 	adds	r2, r2, #0x10
 	ldmeqfd	sp!, {r4-r7}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x02		/* Two bytes of ip are still uncopied */
 	blt	.Lmemcpy_bad_done
 
 	/* Source offset 2: one word per iteration */
 .Lmemcpy_bad2_loop4:
 #ifdef __ARMEB__
 	mov	r4, ip, lsl #16
 #else
 	mov	r4, ip, lsr #16
 #endif
 	ldr	ip, [r1], #0x04
 	subs	r2, r2, #0x04
 #ifdef __ARMEB__
 	orr	r4, r4, ip, lsr #16
 #else
 	orr	r4, r4, ip, lsl #16
 #endif
 	str	r4, [r3], #0x04
 	bge	.Lmemcpy_bad2_loop4
 	sub	r1, r1, #0x02
 	b	.Lmemcpy_bad_done
 
 	/* Source offset 3: 16 bytes per iteration */
 .Lmemcpy_bad3_loop16:
 #ifdef __ARMEB__
 	mov	r4, ip, lsl #24
 #else
 	mov	r4, ip, lsr #24
 #endif
 	ldr	r5, [r1], #0x04
 	pld	[r1, #0x018]
 	ldr	r6, [r1], #0x04
 	ldr	r7, [r1], #0x04
 	ldr	ip, [r1], #0x04
 #ifdef __ARMEB__
 	orr	r4, r4, r5, lsr #8
 	mov	r5, r5, lsl #24
 	orr	r5, r5, r6, lsr #8
 	mov	r6, r6, lsl #24
 	orr	r6, r6, r7, lsr #8
 	mov	r7, r7, lsl #24
 	orr	r7, r7, ip, lsr #8
 #else
 	orr	r4, r4, r5, lsl #8
 	mov	r5, r5, lsr #24
 	orr	r5, r5, r6, lsl #8
 	mov	r6, r6, lsr #24
 	orr	r6, r6, r7, lsl #8
 	mov	r7, r7, lsr #24
 	orr	r7, r7, ip, lsl #8
 #endif
 	str	r4, [r3], #0x04
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
 .Lmemcpy_bad3:
 	subs	r2, r2, #0x10
 	bge	.Lmemcpy_bad3_loop16
 
 	adds	r2, r2, #0x10
 	ldmeqfd	sp!, {r4-r7}
 	RETeq			/* Return now if done */
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x01		/* One byte of ip is still uncopied */
 	blt	.Lmemcpy_bad_done
 
 	/* Source offset 3: one word per iteration */
 .Lmemcpy_bad3_loop4:
 #ifdef __ARMEB__
 	mov	r4, ip, lsl #24
 #else
 	mov	r4, ip, lsr #24
 #endif
 	ldr	ip, [r1], #0x04
 	subs	r2, r2, #0x04
 #ifdef __ARMEB__
 	orr	r4, r4, ip, lsr #8
 #else
 	orr	r4, r4, ip, lsl #8
 #endif
 	str	r4, [r3], #0x04
 	bge	.Lmemcpy_bad3_loop4
 	sub	r1, r1, #0x01
 
 	/*
 	 * Common tail: r2 arrives biased by -4 and r1 points at the next
 	 * uncopied source byte.  After the bias is removed, 0 to 3 bytes are
 	 * left and are copied one at a time.
 	 */
 .Lmemcpy_bad_done:
 	ldmfd	sp!, {r4-r7}
 	adds	r2, r2, #0x04
 	RETeq
 	ldrb	ip, [r1], #0x01
 	cmp	r2, #0x02
 	ldrgeb	r2, [r1], #0x01
 	strb	ip, [r3], #0x01
 	ldrgtb	ip, [r1]
 	strgeb	r2, [r3], #0x01
 	strgtb	ip, [r3]
 	RET
 
 
 /*
  * Handle short copies, possibly misaligned.
  * Some of these are *very* common, thanks to the network stack,
  * and so are handled specially.
  *
  * r2 (the length) indexes a table of one-instruction entries: pc reads
  * as the add instruction + 8, which is the entry for length 0, and each
  * further byte of length advances by one 4-byte slot.  The nop only
  * fills the slot between the add and the table.
  *
  * NOTE(review): the table has entries for lengths 0x00-0x0c only, so
  * the caller presumably branches here only when r2 <= 12 -- confirm at
  * the branch site.  This also assumes RET expands to exactly one
  * instruction, since entry 0x00 must occupy a single slot.
  */
 .Lmemcpy_short:
 	add	pc, pc, r2, lsl #2
 	nop
 	RET			/* 0x00 */
 	b	.Lmemcpy_bytewise	/* 0x01 */
 	b	.Lmemcpy_bytewise	/* 0x02 */
 	b	.Lmemcpy_bytewise	/* 0x03 */
 	b	.Lmemcpy_4		/* 0x04 */
 	b	.Lmemcpy_bytewise	/* 0x05 */
 	b	.Lmemcpy_6		/* 0x06 */
 	b	.Lmemcpy_bytewise	/* 0x07 */
 	b	.Lmemcpy_8		/* 0x08 */
 	b	.Lmemcpy_bytewise	/* 0x09 */
 	b	.Lmemcpy_bytewise	/* 0x0a */
 	b	.Lmemcpy_bytewise	/* 0x0b */
 	b	.Lmemcpy_c		/* 0x0c */
 	/*
 	 * Generic byte loop for the lengths without a special case.
 	 * The first byte is loaded before the loop, so r2 must be non-zero
 	 * here; the table above only sends lengths >= 1 this way.
 	 */
 .Lmemcpy_bytewise:
 	mov	r3, r0			/* We must not clobber r0 */
 	ldrb	ip, [r1], #0x01
 1:	subs	r2, r2, #0x01
 	strb	ip, [r3], #0x01
 	ldrneb	ip, [r1], #0x01		/* Fetch the next byte only if one is left */
 	bne	1b
 	RET
 
 /******************************************************************************
  * Special case for 4 byte copies
  *
  * r0 = destination, r1 = source; r0 is never written by any case.
  *
  * Dispatch: r2 = ((dst & 3) << 2) | (src & 3) picks one of 16 cases.
  * Every case is padded to (1 << LMEMCPY_4_LOG2) bytes, so case N starts
  * at .Lmemcpy_4 + N * 64.  "sub r3, pc, #0x14" produces the address of
  * .Lmemcpy_4 itself: that instruction sits at label + 0x0c and pc reads
  * as the current instruction + 8.  Case 0000 shares the first slot with
  * the dispatch code and is reached by falling through.
  *
  * Each case uses load/store widths and offsets that are naturally
  * aligned for its src/dst combination.  Some cases load whole aligned
  * words that include bytes outside the source range; those bytes are
  * shifted out or never stored.
  *
  * In the lane comments the digits are source byte offsets, '.' is a
  * zeroed lane and 'x' is a byte from outside the source range.
  */
 #define	LMEMCPY_4_LOG2	6	/* 64 bytes */
 #define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
 	LMEMCPY_4_PAD
 .Lmemcpy_4:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	str	r2, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
 	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
 #ifdef __ARMEB__
 	mov	r3, r3, lsl #8		/* r3 = 012. */
 	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
 #else
 	mov	r3, r3, lsr #8		/* r3 = .210 */
 	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
 #endif
 	str	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
 #ifdef __ARMEB__
 	ldrh	r3, [r1]
 	ldrh	r2, [r1, #0x02]
 #else
 	ldrh	r3, [r1, #0x02]
 	ldrh	r2, [r1]
 #endif
 	orr	r3, r2, r3, lsl #16
 	str	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
 	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
 #ifdef __ARMEB__
 	mov	r3, r3, lsl #24		/* r3 = 0... */
 	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
 #else
 	mov	r3, r3, lsr #24		/* r3 = ...0 */
 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
 #endif
 	str	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 #ifdef __ARMEB__
 	strb	r2, [r0, #0x03]
 	mov	r3, r2, lsr #8
 	mov	r1, r2, lsr #24
 	strb	r1, [r0]
 #else
 	strb	r2, [r0]
 	mov	r3, r2, lsr #8
 	mov	r1, r2, lsr #24
 	strb	r1, [r0, #0x03]
 #endif
 	strh	r3, [r0, #0x01]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
 #ifdef __ARMEB__
 	mov	r1, r2, lsr #8		/* r1 = ...0 */
 	strb	r1, [r0]
 	mov	r2, r2, lsl #8		/* r2 = .01. */
 	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
 #else
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
 	mov	r3, r3, lsr #8		/* r3 = ...3 */
 #endif
 	strh	r2, [r0, #0x01]
 	strb	r3, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 0111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 #ifdef __ARMEB__
 	strh	r2, [r0, #0x02]
 	mov	r3, r2, lsr #16
 	strh	r3, [r0]
 #else
 	strh	r2, [r0]
 	mov	r3, r2, lsr #16
 	strh	r3, [r0, #0x02]
 #endif
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
 	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
 	strh	r1, [r0]
 #ifdef __ARMEB__
 	mov	r2, r2, lsl #8		/* r2 = 012. */
 	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
 #else
 	mov	r2, r2, lsr #24		/* r2 = ...2 */
 	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
 #endif
 	strh	r2, [r0, #0x02]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldrh	r3, [r1, #0x02]
 	strh	r2, [r0]
 	strh	r3, [r0, #0x02]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
 	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
 	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
 	strh	r1, [r0, #0x02]
 #ifdef __ARMEB__
 	mov	r3, r3, lsr #24		/* r3 = ...1 */
 	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
 #else
 	mov	r3, r3, lsl #8		/* r3 = 321. */
 	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
 #endif
 	strh	r3, [r0]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 #ifdef __ARMEB__
 	strb	r2, [r0, #0x03]
 	mov	r3, r2, lsr #8
 	mov	r1, r2, lsr #24
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0]
 #else
 	strb	r2, [r0]
 	mov	r3, r2, lsr #8
 	mov	r1, r2, lsr #24
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 #endif
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1110: dst is 8-bit aligned, src is 16-bit aligned
  */
 #ifdef __ARMEB__
 	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	strb	r3, [r0, #0x03]
 	mov	r3, r3, lsr #8		/* r3 = ...2 */
 	orr	r3, r3, r2, lsl #8	/* r3 = .012 */
 	strh	r3, [r0, #0x01]
 	mov	r2, r2, lsr #8		/* r2 = ...0 */
 	strb	r2, [r0]
 #else
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
 	strh	r2, [r0, #0x01]
 	mov	r3, r3, lsr #8		/* r3 = ...3 */
 	strb	r3, [r0, #0x03]
 #endif
 	RET
 	LMEMCPY_4_PAD
 
 /*
  * 1111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x03]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strb	r1, [r0, #0x03]
 	RET
 	LMEMCPY_4_PAD
 
 
 /******************************************************************************
  * Special case for 6 byte copies
  *
  * r0 = destination, r1 = source; r0 is never written by any case.
  *
  * Dispatch: r2 = ((dst & 3) << 2) | (src & 3) picks one of 16 cases.
  * Every case is padded to (1 << LMEMCPY_6_LOG2) bytes, so case N starts
  * at .Lmemcpy_6 + N * 64.  "sub r3, pc, #0x14" produces the address of
  * .Lmemcpy_6 itself: that instruction sits at label + 0x0c and pc reads
  * as the current instruction + 8.  Case 0000 shares the first slot with
  * the dispatch code and is reached by falling through.
  *
  * Each case uses load/store widths and offsets that are naturally
  * aligned for its src/dst combination.  Some cases load whole aligned
  * words that include bytes outside the source range; those bytes are
  * shifted out or never stored.
  *
  * In the lane comments the digits are source byte offsets, '.' is a
  * zeroed lane and 'x' is a byte from outside the source range.
  */
 #define	LMEMCPY_6_LOG2	6	/* 64 bytes */
 #define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
 	LMEMCPY_6_PAD
 .Lmemcpy_6:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	ldrh	r3, [r1, #0x04]
 	str	r2, [r0]
 	strh	r3, [r0, #0x04]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
 #ifdef __ARMEB__
 	mov	r2, r2, lsl #8		/* r2 = 012. */
 	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
 #else
 	mov	r2, r2, lsr #8		/* r2 = .210 */
 	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
 #endif
 	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
 	str	r2, [r0]
 	strh	r3, [r0, #0x04]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 #ifdef __ARMEB__
 	mov	r1, r3, lsr #16		/* r1 = ..23 */
 	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
 	str	r1, [r0]
 	strh	r3, [r0, #0x04]
 #else
 	mov	r1, r3, lsr #16		/* r1 = ..54 */
 	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
 	str	r2, [r0]
 	strh	r1, [r0, #0x04]
 #endif
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
 	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
 	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
 #ifdef __ARMEB__
 	mov	r2, r2, lsl #24		/* r2 = 0... */
 	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
 	mov	r3, r3, lsl #8		/* r3 = 234. */
 	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
 #else
 	mov	r2, r2, lsr #24		/* r2 = ...0 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
 	mov	r1, r1, lsl #8		/* r1 = xx5. */
 	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
 #endif
 	str	r2, [r0]
 	strh	r1, [r0, #0x04]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
 	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
 	strh	r1, [r0, #0x01]
 #ifdef __ARMEB__
 	mov	r1, r3, lsr #24		/* r1 = ...0 */
 	strb	r1, [r0]
 	mov	r3, r3, lsl #8		/* r3 = 123. */
 	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
 #else
 	strb	r3, [r0]
 	mov	r3, r3, lsr #24		/* r3 = ...3 */
 	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
 	mov	r2, r2, lsr #8		/* r2 = ...5 */
 #endif
 	strh	r3, [r0, #0x03]
 	strb	r2, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrh	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strh	ip, [r0, #0x03]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
 #ifdef __ARMEB__
 	mov	r3, r2, lsr #8		/* r3 = ...0 */
 	strb	r3, [r0]
 	strb	r1, [r0, #0x05]
 	mov	r3, r1, lsr #8		/* r3 = .234 */
 	strh	r3, [r0, #0x03]
 	mov	r3, r2, lsl #8		/* r3 = .01. */
 	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
 	strh	r3, [r0, #0x01]
 #else
 	strb	r2, [r0]
 	mov	r3, r1, lsr #24
 	strb	r3, [r0, #0x05]
 	mov	r3, r1, lsr #8		/* r3 = .543 */
 	strh	r3, [r0, #0x03]
 	mov	r3, r2, lsr #8		/* r3 = ...1 */
 	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
 	strh	r3, [r0, #0x01]
 #endif
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 0111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrh	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strh	ip, [r0, #0x03]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
 #ifdef __ARMEB__
 	ldr	r2, [r1]		/* r2 = 0123 */
 	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
 	mov	r1, r2, lsr #16		/* r1 = ..01 */
 	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
 	strh	r1, [r0]
 	str	r3, [r0, #0x02]
 #else
 	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
 	ldr	r3, [r1]		/* r3 = 3210 */
 	mov	r2, r2, lsl #16		/* r2 = 54.. */
 	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
 	strh	r3, [r0]
 	str	r2, [r0, #0x02]
 #endif
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
 	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
 #ifdef __ARMEB__
 	mov	r2, r2, lsr #8		/* r2 = .345 */
 	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
 #else
 	mov	r2, r2, lsl #8		/* r2 = 543. */
 	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
 #endif
 	strh	r1, [r0]
 	str	r2, [r0, #0x02]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldr	r3, [r1, #0x02]
 	strh	r2, [r0]
 	str	r3, [r0, #0x02]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
 	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
 #ifdef __ARMEB__
 	mov	r3, r3, lsl #8		/* r3 = ..0. */
 	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
 	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
 #else
 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
 	mov	r1, r1, lsl #24		/* r1 = 5... */
 	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
 #endif
 	strh	r3, [r0]
 	str	r1, [r0, #0x02]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
 #ifdef __ARMEB__
 	mov	r3, r2, lsr #24		/* r3 = ...0 */
 	strb	r3, [r0]
 	mov	r2, r2, lsl #8		/* r2 = 123. */
 	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
 #else
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = .321 */
 	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
 	mov	r1, r1, lsr #8		/* r1 = ...5 */
 #endif
 	str	r2, [r0, #0x01]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldrh	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	strh	ip, [r0, #0x03]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
 #ifdef __ARMEB__
 	mov	r3, r2, lsr #8		/* r3 = ...0 */
 	strb	r3, [r0]
 	mov	r2, r2, lsl #24		/* r2 = 1... */
 	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
 #else
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
 	mov	r1, r1, lsr #24		/* r1 = ...5 */
 #endif
 	str	r2, [r0, #0x01]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 /*
  * 1111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldr	r3, [r1, #0x01]
 	ldrb	r1, [r1, #0x05]
 	strb	r2, [r0]
 	str	r3, [r0, #0x01]
 	strb	r1, [r0, #0x05]
 	RET
 	LMEMCPY_6_PAD
 
 
 /******************************************************************************
  * Special case for 8 byte copies
  *
  * r0 = destination, r1 = source; r0 is never written by any case.
  *
  * Dispatch: r2 = ((dst & 3) << 2) | (src & 3) picks one of 16 cases.
  * Every case is padded to (1 << LMEMCPY_8_LOG2) bytes, so case N starts
  * at .Lmemcpy_8 + N * 64.  "sub r3, pc, #0x14" produces the address of
  * .Lmemcpy_8 itself: that instruction sits at label + 0x0c and pc reads
  * as the current instruction + 8.  Case 0000 shares the first slot with
  * the dispatch code and is reached by falling through.
  *
  * Each case uses load/store widths and offsets that are naturally
  * aligned for its src/dst combination.  Some cases load whole aligned
  * words that include bytes outside the source range; those bytes are
  * shifted out or never stored.
  *
  * In the lane comments the digits are source byte offsets, '.' is a
  * zeroed lane and 'x' is a byte from outside the source range.
  */
 #define	LMEMCPY_8_LOG2	6	/* 64 bytes */
 #define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
 	LMEMCPY_8_PAD
 .Lmemcpy_8:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	ldr	r3, [r1, #0x04]
 	str	r2, [r0]
 	str	r3, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
 	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
 	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
 #ifdef __ARMEB__
 	mov	r3, r3, lsl #8		/* r3 = 012. */
 	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
 	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
 #else
 	mov	r3, r3, lsr #8		/* r3 = .210 */
 	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
 	mov	r1, r1, lsl #24		/* r1 = 7... */
 	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
 #endif
 	str	r3, [r0]
 	str	r2, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
 #ifdef __ARMEB__
 	mov	r2, r2, lsl #16		/* r2 = 01.. */
 	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
 	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
 #else
 	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
 	mov	r3, r3, lsr #16		/* r3 = ..54 */
 	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
 #endif
 	str	r2, [r0]
 	str	r3, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
 	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
 #ifdef __ARMEB__
 	mov	r3, r3, lsl #24		/* r3 = 0... */
 	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
 	mov	r2, r2, lsl #24		/* r2 = 4... */
 	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
 #else
 	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
 	mov	r2, r2, lsr #24		/* r2 = ...4 */
 	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
 #endif
 	str	r3, [r0]
 	str	r2, [r0, #0x04]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
 	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
 #ifdef __ARMEB__
 	mov	r1, r3, lsr #24		/* r1 = ...0 */
 	strb	r1, [r0]
 	mov	r1, r3, lsr #8		/* r1 = .012 */
 	strb	r2, [r0, #0x07]
 	mov	r3, r3, lsl #24		/* r3 = 3... */
 	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
 #else
 	strb	r3, [r0]
 	mov	r1, r2, lsr #24		/* r1 = ...7 */
 	strb	r1, [r0, #0x07]
 	mov	r1, r3, lsr #8		/* r1 = .321 */
 	mov	r3, r3, lsr #24		/* r3 = ...3 */
 	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
 #endif
 	strh	r1, [r0, #0x01]
 	str	r3, [r0, #0x03]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldr	ip, [r1, #0x03]
 	ldrb	r1, [r1, #0x07]
 	strb	r2, [r0]
 	strh	r3, [r0, #0x01]
 	str	ip, [r0, #0x03]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
 #ifdef __ARMEB__
 	mov	ip, r2, lsr #8		/* ip = ...0 */
 	strb	ip, [r0]
 	mov	ip, r2, lsl #8		/* ip = .01. */
 	orr	ip, ip, r3, lsr #24	/* ip = .012 */
 	strb	r1, [r0, #0x07]
 	mov	r3, r3, lsl #8		/* r3 = 345. */
 	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
 #else
 	strb	r2, [r0]		/* 0 */
 	mov	ip, r1, lsr #8		/* ip = ...7 */
 	strb	ip, [r0, #0x07]		/* 7 */
 	mov	ip, r2, lsr #8		/* ip = ...1 */
 	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
 	mov	r3, r3, lsr #8		/* r3 = .543 */
 	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
 #endif
 	strh	ip, [r0, #0x01]
 	str	r3, [r0, #0x03]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 0111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
 	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
 	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
 	strb	r3, [r0]
 	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
 #ifdef __ARMEB__
 	strh	r3, [r0, #0x01]
 	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
 #else
 	strh	ip, [r0, #0x01]
 	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
 #endif
 	str	r2, [r0, #0x03]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
 #ifdef __ARMEB__
 	strh	r1, [r0]
 	mov	r1, r3, lsr #16		/* r1 = ..45 */
 	orr	r2, r1 ,r2, lsl #16	/* r2 = 2345 */
 #else
 	strh	r2, [r0]
 	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
 	mov	r3, r3, lsr #16		/* r3 = ..76 */
 #endif
 	str	r2, [r0, #0x02]
 	strh	r3, [r0, #0x06]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
 	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
 	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
 	strh	r1, [r0]
 #ifdef __ARMEB__
 	mov	r1, r2, lsl #24		/* r1 = 2... */
 	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
 	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
 #else
 	mov	r1, r2, lsr #24		/* r1 = ...2 */
 	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
 	mov	r3, r3, lsr #24		/* r3 = ...6 */
 	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
 #endif
 	str	r1, [r0, #0x02]
 	strh	r3, [r0, #0x06]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldr	ip, [r1, #0x02]
 	ldrh	r3, [r1, #0x06]
 	strh	r2, [r0]
 	str	ip, [r0, #0x02]
 	strh	r3, [r0, #0x06]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned
  */
 	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
 	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
 	ldrb	ip, [r1]		/* ip = ...0 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
 	strh	r1, [r0, #0x06]
 #ifdef __ARMEB__
 	mov	r3, r3, lsr #24		/* r3 = ...5 */
 	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
 	mov	r2, r2, lsr #24		/* r2 = ...1 */
 	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
 #else
 	mov	r3, r3, lsl #24		/* r3 = 5... */
 	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
 	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
 #endif
 	str	r3, [r0, #0x02]
 	strh	r2, [r0]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1100: dst is 8-bit aligned, src is 32-bit aligned
  */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
 	strh	r1, [r0, #0x05]
 #ifdef __ARMEB__
 	strb	r3, [r0, #0x07]
 	mov	r1, r2, lsr #24		/* r1 = ...0 */
 	strb	r1, [r0]
 	mov	r2, r2, lsl #8		/* r2 = 123. */
 	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
 	str	r2, [r0, #0x01]
 #else
 	strb	r2, [r0]
 	mov	r1, r3, lsr #24		/* r1 = ...7 */
 	strb	r1, [r0, #0x07]
 	mov	r2, r2, lsr #8		/* r2 = .321 */
 	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
 	str	r2, [r0, #0x01]
 #endif
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1101: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r3, [r1]		/* r3 = ...0 */
 	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
 	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
 	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
 	strb	r3, [r0]
 	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
 #ifdef __ARMEB__
 	strh	ip, [r0, #0x05]
 	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
 #else
 	strh	r3, [r0, #0x05]
 	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
 #endif
 	str	r2, [r0, #0x01]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1110: dst is 8-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
 #ifdef __ARMEB__
 	mov	ip, r2, lsr #8		/* ip = ...0 */
 	strb	ip, [r0]
 	mov	ip, r2, lsl #24		/* ip = 1... */
 	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
 	strb	r1, [r0, #0x07]
 	mov	r1, r1, lsr #8		/* r1 = ...6 */
 	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
 #else
 	strb	r2, [r0]
 	mov	ip, r2, lsr #8		/* ip = ...1 */
 	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
 	mov	r2, r1, lsr #8		/* r2 = ...7 */
 	strb	r2, [r0, #0x07]
 	mov	r1, r1, lsl #8		/* r1 = .76. */
 	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
 #endif
 	str	ip, [r0, #0x01]
 	strh	r1, [r0, #0x05]
 	RET
 	LMEMCPY_8_PAD
 
 /*
  * 1111: dst is 8-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]
 	ldr	ip, [r1, #0x01]
 	ldrh	r3, [r1, #0x05]
 	ldrb	r1, [r1, #0x07]
 	strb	r2, [r0]
 	str	ip, [r0, #0x01]
 	strh	r3, [r0, #0x05]
 	strb	r1, [r0, #0x07]
 	RET
 	LMEMCPY_8_PAD
 
 /******************************************************************************
  * Special case for 12 byte copies
  */
 #define	LMEMCPY_C_LOG2	7	/* 128 bytes */
 #define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
 	LMEMCPY_C_PAD
 .Lmemcpy_c:
 	and	r2, r1, #0x03
 	orr	r2, r2, r0, lsl #2
 	ands	r2, r2, #0x0f
 	sub	r3, pc, #0x14
 	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
 
 /*
  * 0000: dst is 32-bit aligned, src is 32-bit aligned
  */
 	ldr	r2, [r1]
 	ldr	r3, [r1, #0x04]
 	ldr	r1, [r1, #0x08]
 	str	r2, [r0]
 	str	r3, [r0, #0x04]
 	str	r1, [r0, #0x08]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0001: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
 	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
 	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
 #ifdef __ARMEB__
 	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
 	str	r2, [r0, #0x08]
 	mov	r2, ip, lsr #24		/* r2 = ...7 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
 	mov	r1, r1, lsl #8		/* r1 = 012. */
 	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
 #else
 	mov	r2, r2, lsl #24		/* r2 = B... */
 	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
 	str	r2, [r0, #0x08]
 	mov	r2, ip, lsl #24		/* r2 = 7... */
 	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
 	mov	r1, r1, lsr #8		/* r1 = .210 */
 	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
 #endif
 	str	r2, [r0, #0x04]
 	str	r1, [r0]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0010: dst is 32-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
 	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
 #ifdef __ARMEB__
 	mov	r2, r2, lsl #16		/* r2 = 01.. */
 	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
 	str	r2, [r0]
 	mov	r3, r3, lsl #16		/* r3 = 45.. */
 	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
 	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
 #else
 	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
 	str	r2, [r0]
 	mov	r3, r3, lsr #16		/* r3 = ..54 */
 	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
 	mov	r1, r1, lsl #16		/* r1 = BA.. */
 	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
 #endif
 	str	r3, [r0, #0x04]
 	str	r1, [r0, #0x08]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0011: dst is 32-bit aligned, src is 8-bit aligned
  */
 	ldrb	r2, [r1]		/* r2 = ...0 */
 	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
 	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
 	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
 #ifdef __ARMEB__
 	mov	r2, r2, lsl #24		/* r2 = 0... */
 	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
 	str	r2, [r0]
 	mov	r3, r3, lsl #24		/* r3 = 4... */
 	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
 	mov	r1, r1, lsr #8		/* r1 = .9AB */
 	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
 #else
 	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
 	str	r2, [r0]
 	mov	r3, r3, lsr #24		/* r3 = ...4 */
 	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
 	mov	r1, r1, lsl #8		/* r1 = BA9. */
 	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
 #endif
 	str	r3, [r0, #0x04]
 	str	r1, [r0, #0x08]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
  *
  * The 12 bytes go out as byte, halfword, word, word, byte at
  * destination offsets 0, 1, 3, 7 and 0xb.
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
 	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
 	strh	r1, [r0, #0x01]
 #ifdef __ARMEB__
 	mov	r1, r2, lsr #24		/* r1 = ...0 */
 	strb	r1, [r0]
 	mov	r1, r2, lsl #24		/* r1 = 3... */
 	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
 	mov	r1, r3, lsl #24		/* r1 = 7... */
 	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
 #else
 	strb	r2, [r0]
 	mov	r1, r2, lsr #24		/* r1 = ...3 */
 	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
 	mov	r1, r3, lsr #24		/* r1 = ...7 */
 	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
 	mov	ip, ip, lsr #24		/* ip = ...B */
 #endif
 	str	r2, [r0, #0x03]
 	str	r1, [r0, #0x07]
 	strb	ip, [r0, #0x0b]		/* BE: low byte of ip is already B */
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
  */
 	ldrb	r2, [r1]
 	ldrh	r3, [r1, #0x01]
 	ldr	ip, [r1, #0x03]
 	strb	r2, [r0]
 	ldr	r2, [r1, #0x07]
 	ldrb	r1, [r1, #0x0b]
 	strh	r3, [r0, #0x01]
 	str	ip, [r0, #0x03]
 	str	r2, [r0, #0x07]
 	strb	r1, [r0, #0x0b]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
  */
 	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
 	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
 	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
 #ifdef __ARMEB__
 	mov	r2, r2, ror #8		/* r2 = 1..0 */
 	strb	r2, [r0]
 	mov	r2, r2, lsr #16		/* r2 = ..1. */
 	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
 	strh	r2, [r0, #0x01]
 	mov	r2, r3, lsl #8		/* r2 = 345. */
 	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
 	mov	r2, ip, lsl #8		/* r2 = 789. */
 	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
 #else
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
 	strh	r2, [r0, #0x01]
 	mov	r2, r3, lsr #8		/* r2 = .543 */
 	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
 	mov	r2, ip, lsr #8		/* r2 = .987 */
 	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
 	mov	r1, r1, lsr #8		/* r1 = ...B */
 #endif
 	str	r3, [r0, #0x03]
 	str	r2, [r0, #0x07]
 	strb	r1, [r0, #0x0b]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
  */
 	ldrb	r2, [r1]
 	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
 	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
 	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
 	strb	r2, [r0]
 #ifdef __ARMEB__
 	mov	r2, r3, lsr #16		/* r2 = ..12 */
 	strh	r2, [r0, #0x01]
 	mov	r3, r3, lsl #16		/* r3 = 34.. */
 	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
 	mov	ip, ip, lsl #16		/* ip = 78.. */
 	orr	ip, ip, r1, lsr #16	/* ip = 789A */
 	mov	r1, r1, lsr #8		/* r1 = .9AB */
 #else
 	strh	r3, [r0, #0x01]
 	mov	r3, r3, lsr #16		/* r3 = ..43 */
 	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
 	mov	ip, ip, lsr #16		/* ip = ..87 */
 	orr	ip, ip, r1, lsl #16	/* ip = A987 */
 	mov	r1, r1, lsr #16		/* r1 = ..xB */
 #endif
 	str	r3, [r0, #0x03]
 	str	ip, [r0, #0x07]
 	strb	r1, [r0, #0x0b]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1000: dst is 16-bit aligned, src is 32-bit aligned
  */
 	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
 	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
 	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
 	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
 #ifdef __ARMEB__
 	strh	r1, [r0]
 	mov	r1, ip, lsl #16		/* r1 = 23.. */
 	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
 	mov	r3, r3, lsl #16		/* r3 = 67.. */
 	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
 #else
 	strh	ip, [r0]
 	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
 	mov	r3, r3, lsr #16		/* r3 = ..76 */
 	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
 	mov	r2, r2, lsr #16		/* r2 = ..BA */
 #endif
 	str	r1, [r0, #0x02]
 	str	r3, [r0, #0x06]
 	strh	r2, [r0, #0x0a]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
  */
 	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
 	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
 	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
 	strh	ip, [r0]
 	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
 	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
 #ifdef __ARMEB__
 	mov	r2, r2, lsl #24		/* r2 = 2... */
 	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
 	mov	r3, r3, lsl #24		/* r3 = 6... */
 	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
 	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
 #else
 	mov	r2, r2, lsr #24		/* r2 = ...2 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
 	mov	r3, r3, lsr #24		/* r3 = ...6 */
 	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
 	mov	r1, r1, lsl #8		/* r1 = ..B. */
 	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
 #endif
 	str	r2, [r0, #0x02]
 	str	r3, [r0, #0x06]
 	strh	r1, [r0, #0x0a]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1010: dst is 16-bit aligned, src is 16-bit aligned
  */
 	ldrh	r2, [r1]
 	ldr	r3, [r1, #0x02]
 	ldr	ip, [r1, #0x06]
 	ldrh	r1, [r1, #0x0a]
 	strh	r2, [r0]
 	str	r3, [r0, #0x02]
 	str	ip, [r0, #0x06]
 	strh	r1, [r0, #0x0a]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
  */
 	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
 	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
 	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
 	strh	ip, [r0, #0x0a]
 	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
 	ldrb	r1, [r1]		/* r1 = ...0 */
 #ifdef __ARMEB__
 	mov	r2, r2, lsr #24		/* r2 = ...9 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
 	mov	r3, r3, lsr #24		/* r3 = ...5 */
 	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
 	mov	r1, r1, lsl #8		/* r1 = ..0. */
 	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
 #else
 	mov	r2, r2, lsl #24		/* r2 = 9... */
 	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
 	mov	r3, r3, lsl #24		/* r3 = 5... */
 	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
 	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
 #endif
 	str	r2, [r0, #0x06]
 	str	r3, [r0, #0x02]
 	strh	r1, [r0]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
  */
 	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
 	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
 	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
 #ifdef __ARMEB__
 	mov	r3, r2, lsr #24		/* r3 = ...0 */
 	strb	r3, [r0]		/* dst byte 0 */
 	mov	r2, r2, lsl #8		/* r2 = 123. */
 	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
 	str	r2, [r0, #0x01]
 	mov	r2, ip, lsl #8		/* r2 = 567. */
 	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
 	str	r2, [r0, #0x05]
 	mov	r2, r1, lsr #8		/* r2 = .89A */
 	strh	r2, [r0, #0x09]		/* only the low half (9A) is stored */
 	strb	r1, [r0, #0x0b]		/* low byte of r1 = B */
 #else
 	strb	r2, [r0]		/* dst byte 0 */
 	mov	r3, r2, lsr #8		/* r3 = .321 */
 	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
 	str	r3, [r0, #0x01]
 	mov	r3, ip, lsr #8		/* r3 = .765 */
 	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
 	str	r3, [r0, #0x05]
 	mov	r1, r1, lsr #8		/* r1 = .BA9 */
 	strh	r1, [r0, #0x09]		/* only the low half (A9) is stored */
 	mov	r1, r1, lsr #16		/* r1 = ...B */
 	strb	r1, [r0, #0x0b]
 #endif
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
  */
 	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
 	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
 	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
 	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
 	strb	r2, [r0, #0x0b]
 #ifdef __ARMEB__
 	strh	r3, [r0, #0x09]
 	mov	r3, r3, lsr #16		/* r3 = ..78 */
 	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
 	mov	ip, ip, lsr #16		/* ip = ..34 */
 	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
 	mov	r1, r1, lsr #16		/* r1 = ..x0 */
 #else
 	mov	r2, r3, lsr #16		/* r2 = ..A9 */
 	strh	r2, [r0, #0x09]
 	mov	r3, r3, lsl #16		/* r3 = 87.. */
 	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
 	mov	ip, ip, lsl #16		/* ip = 43.. */
 	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
 	mov	r1, r1, lsr #8		/* r1 = .210 */
 #endif
 	str	r3, [r0, #0x05]
 	str	ip, [r0, #0x01]
 	strb	r1, [r0]
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
  */
 #ifdef __ARMEB__
 	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
 	ldr	ip, [r1, #0x06]		/* ip = 6789 */
 	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
 	ldrh	r1, [r1]		/* r1 = ..01 */
 	strb	r2, [r0, #0x0b]
 	mov	r2, r2, lsr #8		/* r2 = ...A */
 	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
 	mov	ip, ip, lsr #8		/* ip = .678 */
 	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
 	mov	r3, r3, lsr #8		/* r3 = .234 */
 	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
 	mov	r1, r1, lsr #8		/* r1 = ...0 */
 	strb	r1, [r0]
 	str	r3, [r0, #0x01]
 	str	ip, [r0, #0x05]
 	strh	r2, [r0, #0x09]
 #else
 	ldrh	r2, [r1]		/* r2 = ..10 */
 	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
 	ldr	ip, [r1, #0x06]		/* ip = 9876 */
 	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
 	strb	r2, [r0]
 	mov	r2, r2, lsr #8		/* r2 = ...1 */
 	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
 	mov	r3, r3, lsr #24		/* r3 = ...5 */
 	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
 	mov	ip, ip, lsr #24		/* ip = ...9 */
 	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
 	mov	r1, r1, lsr #8		/* r1 = ...B */
 	str	r2, [r0, #0x01]
 	str	r3, [r0, #0x05]
 	strh	ip, [r0, #0x09]
 	strb	r1, [r0, #0x0b]
 #endif
 	RET
 	LMEMCPY_C_PAD
 
 /*
  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
  */
 	ldrb	r2, [r1]
 	ldr	r3, [r1, #0x01]
 	ldr	ip, [r1, #0x05]
 	strb	r2, [r0]
 	ldrh	r2, [r1, #0x09]
 	ldrb	r1, [r1, #0x0b]
 	str	r3, [r0, #0x01]
 	str	ip, [r0, #0x05]
 	strh	r2, [r0, #0x09]
 	strb	r1, [r0, #0x0b]
 	RET
 END(memcpy)
 #endif /* _ARM_ARCH_5E */
 
 #ifdef GPROF
 
 ENTRY(user)	/* NOTE(review): presumably profiler marker symbols - confirm */
 	nop	/* no RET: control would fall through into btrap */
+END(user)
 ENTRY(btrap)	/* under GPROF, ENTRY() also emits the __mcount prologue */
 	nop
+END(btrap)
 ENTRY(etrap)
 	nop
+END(etrap)
 ENTRY(bintr)
 	nop
+END(bintr)
 ENTRY(eintr)
 	nop	/* last stub: nothing follows it inside the GPROF block */
-
+END(eintr)
 #endif
Index: stable/10/sys/arm/include/asm.h
===================================================================
--- stable/10/sys/arm/include/asm.h	(revision 269795)
+++ stable/10/sys/arm/include/asm.h	(revision 269796)
@@ -1,195 +1,212 @@
 /*	$NetBSD: asm.h,v 1.5 2003/08/07 16:26:53 agc Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)asm.h	5.5 (Berkeley) 5/7/91
  *
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_ASM_H_
 #define _MACHINE_ASM_H_
 #include <sys/cdefs.h>
 
 #define	_C_LABEL(x)	x
 #define	_ASM_LABEL(x)	x
 
 #define I32_bit (1 << 7)	/* IRQ disable */
 #define F32_bit (1 << 6)        /* FIQ disable */
 
 #define CPU_CONTROL_32BP_ENABLE 0x00000010 /* P: 32-bit exception handlers */
 #define CPU_CONTROL_32BD_ENABLE 0x00000020 /* D: 32-bit addressing */
 
 #ifndef _ALIGN_TEXT
 # define _ALIGN_TEXT .align 0
 #endif
 
 #ifdef __ARM_EABI__
 #define	STOP_UNWINDING	.cantunwind
 #define	_FNSTART	.fnstart
 #define	_FNEND		.fnend
 #else
 #define	STOP_UNWINDING
 #define	_FNSTART
 #define	_FNEND
 #endif
 
 /*
  * gas/arm uses @ as its single-line comment character, so @ cannot be used
  * here.  Instead, gas accepts # in place of @ in .type directives.
  * We define a couple of macros so that assembly code will not be dependent
  * on one or the other.
  */
 #define _ASM_TYPE_FUNCTION	#function
 #define _ASM_TYPE_OBJECT	#object
 #define GLOBAL(x) .globl x	/* declare symbol x as global */
 #define _ENTRY(x) \
 	.text; _ALIGN_TEXT; .globl x; .type x,_ASM_TYPE_FUNCTION; x: _FNSTART
-
 #define	_END(x)	.size x, . - x; _FNEND
 
+/*
+ * EENTRY()/EEND() mark "extra" entry/exit points from a function.
+ * The unwind info cannot handle the concept of a nested function, or a function
+ * with multiple .fnstart directives, but some of our assembler code is written
+ * with multiple labels to allow entry at several points.  The EENTRY() macro
+ * defines such an extra entry point without a new .fnstart, so that it's
+ * basically just a label that you can jump to.  The EEND() macro does nothing
+ * at all, except document the exit point associated with the same-named entry.
+ */
+#define _EENTRY(x) 	.globl x; .type x,_ASM_TYPE_FUNCTION; x:
+#define _EEND(x)	/* nothing */
+
 #ifdef GPROF
 #  define _PROF_PROLOGUE	\
 	mov ip, lr; bl __mcount
 #else
 # define _PROF_PROLOGUE
 #endif
 
 #define	ENTRY(y)	_ENTRY(_C_LABEL(y)); _PROF_PROLOGUE
+#define	EENTRY(y)	_EENTRY(_C_LABEL(y)); _PROF_PROLOGUE
 #define	ENTRY_NP(y)	_ENTRY(_C_LABEL(y))
+#define	EENTRY_NP(y)	_EENTRY(_C_LABEL(y))
 #define	END(y)		_END(_C_LABEL(y))
+#define	EEND(y)
 #define	ASENTRY(y)	_ENTRY(_ASM_LABEL(y)); _PROF_PROLOGUE
+#define	ASEENTRY(y)	_EENTRY(_ASM_LABEL(y)); _PROF_PROLOGUE
 #define	ASENTRY_NP(y)	_ENTRY(_ASM_LABEL(y))
+#define	ASEENTRY_NP(y)	_EENTRY(_ASM_LABEL(y))
 #define	ASEND(y)	_END(_ASM_LABEL(y))
+#define	ASEEND(y)
 
 #define	ASMSTR		.asciz
 
 #if defined(PIC)
 #define	PLT_SYM(x)	PIC_SYM(x, PLT)
 #define	GOT_SYM(x)	PIC_SYM(x, GOT)
 #define	GOT_GET(x,got,sym)	\
 	ldr	x, sym;		\
 	ldr	x, [x, got]
 #define	GOT_INIT(got,gotsym,pclabel) \
 	ldr	got, gotsym;	\
 	add	got, got, pc;	\
 	pclabel:
 #define	GOT_INITSYM(gotsym,pclabel) \
 	gotsym: .word _C_LABEL(_GLOBAL_OFFSET_TABLE_) + (. - (pclabel+4))
 
 #ifdef __STDC__
 #define	PIC_SYM(x,y)	x ## ( ## y ## )
 #else
 #define	PIC_SYM(x,y)	x/**/(/**/y/**/)
 #endif
 
 #else
 #define	PLT_SYM(x)	x
 #define	GOT_SYM(x)	x
 #define	GOT_GET(x,got,sym)	\
 	ldr	x, sym;
 #define	GOT_INIT(got,gotsym,pclabel)
 #define	GOT_INITSYM(gotsym,pclabel)
 #define	PIC_SYM(x,y)	x
 #endif	/* PIC */
 
 #undef __FBSDID
 #if !defined(lint) && !defined(STRIP_FBSDID)
 #define __FBSDID(s)     .ident s
 #else
 #define __FBSDID(s)     /* nothing */
 #endif
 	
 
 #define	WEAK_ALIAS(alias,sym)						\
 	.weak alias;							\
 	alias = sym
 
 #ifdef __STDC__
 #define	WARN_REFERENCES(sym,msg)					\
 	.stabs msg ## ,30,0,0,0 ;					\
 	.stabs __STRING(_C_LABEL(sym)) ## ,1,0,0,0
 #else
 #define	WARN_REFERENCES(sym,msg)					\
 	.stabs msg,30,0,0,0 ;						\
 	.stabs __STRING(sym),1,0,0,0
 #endif /* __STDC__ */
 
 /* Exactly one of the __ARM_ARCH_*__ macros will be defined by the compiler. */
 /* The _ARM_ARCH_* macros are deprecated and will be removed soon. */
 /* This should be moved into another header so it can be used in
  * both asm and C code. machine/asm.h cannot be included in C code. */
 #if defined (__ARM_ARCH_7__) || defined (__ARM_ARCH_7A__)
 #define _ARM_ARCH_7
 #define _HAVE_ARMv7_INSTRUCTIONS 1
 #endif
 
 #if defined (_HAVE_ARMv7_INSTRUCTIONS) || defined (__ARM_ARCH_6__) || \
 	defined (__ARM_ARCH_6J__) || defined (__ARM_ARCH_6K__) || \
 	defined (__ARM_ARCH_6Z__) || defined (__ARM_ARCH_6ZK__)
 #define _ARM_ARCH_6
 #define _HAVE_ARMv6_INSTRUCTIONS 1
 #endif
 
 #if defined (_HAVE_ARMv6_INSTRUCTIONS) || defined (__ARM_ARCH_5TE__) || \
     defined (__ARM_ARCH_5TEJ__) || defined (__ARM_ARCH_5E__)
 #define _ARM_ARCH_5E
 #define _HAVE_ARMv5E_INSTRUCTIONS 1
 #endif
 
 #if defined (_HAVE_ARMv5E_INSTRUCTIONS) || defined (__ARM_ARCH_5__) || \
     defined (__ARM_ARCH_5T__)
 #define _ARM_ARCH_5
 #define _HAVE_ARMv5_INSTRUCTIONS 1
 #endif
 
 #if defined (_HAVE_ARMv5_INSTRUCTIONS) || defined (__ARM_ARCH_4T__)
 #define _ARM_ARCH_4T
 #define _HAVE_ARMv4T_INSTRUCTIONS 1
 #endif
 
 /* FreeBSD requires ARMv4, so this is always set. */
 #define _HAVE_ARMv4_INSTRUCTIONS 1
 
 #if defined (_HAVE_ARMv4T_INSTRUCTIONS)
 # define RET	bx	lr
 # define RETeq	bxeq	lr
 # define RETne	bxne	lr
 # define RETc(c) bx##c	lr
 #else
 # define RET	mov	pc, lr
 # define RETeq	moveq	pc, lr
 # define RETne	movne	pc, lr
 # define RETc(c) mov##c	pc, lr
 #endif
 
 #endif /* !_MACHINE_ASM_H_ */
Index: stable/10/sys/libkern/arm/divsi3.S
===================================================================
--- stable/10/sys/libkern/arm/divsi3.S	(revision 269795)
+++ stable/10/sys/libkern/arm/divsi3.S	(revision 269796)
@@ -1,408 +1,408 @@
 /*	$NetBSD: divsi3.S,v 1.4 2003/04/05 23:27:15 bjh21 Exp $	*/
 
 /*-
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 /* 
  * stack is aligned as there's a possibility of branching to L_overflow
  * which makes a C call
  */
 
 ENTRY_NP(__umodsi3)		/* unsigned remainder: r0 = r0 % r1 */
 	stmfd	sp!, {lr}	/* save return address across the bl */
 	sub	sp, sp, #4	/* align stack */
 	bl	.L_udivide	/* returns r0 = quotient, r1 = remainder */
 	add	sp, sp, #4	/* drop the alignment padding again */
 	mov	r0, r1		/* result is the remainder */
 	ldmfd	sp!, {pc}	/* return by popping saved lr into pc */
 END(__umodsi3)
 
 ENTRY_NP(__modsi3)		/* signed remainder: r0 = r0 % r1 */
 	stmfd	sp!, {lr}	/* save return address across the bl */
 	sub	sp, sp, #4	/* align stack */
 	bl	.L_divide	/* returns r0 = quotient, r1 = remainder */
 	add	sp, sp, #4	/* drop the alignment padding again */
 	mov	r0, r1		/* result is the remainder */
 	ldmfd	sp!, {pc}	/* return by popping saved lr into pc */
 
 .L_overflow:			/* divisor == 0; branched to from .L_udivide/.L_divide */
 #if !defined(_KERNEL) && !defined(_STANDALONE)
 	mov	r0, #8			/* SIGFPE */
 	bl	PIC_SYM(_C_LABEL(raise), PLT)	/* raise it */
 	mov	r0, #0
 #else
 	/* XXX should cause a fatal error */
 	mvn	r0, #0			/* r0 = 0xffffffff */
 #endif
 	RET
 END(__modsi3)
 
+ENTRY_NP(__udivsi3)
 #ifdef __ARM_EABI__
-ENTRY_NP(__aeabi_uidiv)
-ENTRY_NP(__aeabi_uidivmod)
+EENTRY_NP(__aeabi_uidiv)
+EENTRY_NP(__aeabi_uidivmod)
 #endif
-ENTRY_NP(__udivsi3)
 .L_udivide:				/* r0 = r0 / r1; r1 = r0 % r1 */
 	eor     r0, r1, r0 		/* three EORs exchange r0 and r1 */
 	eor     r1, r0, r1 
 	eor     r0, r1, r0 
 					/* r0 = r1 / r0; r1 = r1 % r0 */
 	cmp	r0, #1			/* r0 is now the divisor */
 	bcc	.L_overflow		/* divisor == 0 */
 	beq	.L_divide_l0		/* divisor == 1 */
 	mov	ip, #0			/* no flag bits set yet */
 	movs	r1, r1			/* test the top bit of the dividend */
 	bpl	.L_divide_l1		/* clear: use the common core directly */
 	orr	ip, ip, #0x20000000	/* ip bit 0x20000000 = -ve r1 */
 	movs	r1, r1, lsr #1		/* halve dividend; dropped bit 0 -> C */
 	orrcs	ip, ip, #0x10000000	/* ip bit 0x10000000 = bit 0 of r1 */
 	b	.L_divide_l1		/* fixed up later in .L_udivide_l1 */
 
 .L_divide_l0:				/* r0 == 1: quotient = r1, remainder = 0 */
 	mov	r0, r1
 	mov	r1, #0
 	RET
 #ifdef __ARM_EABI__
-END(__aeabi_uidiv)
-END(__aeabi_uidivmod)
+EEND(__aeabi_uidiv)
+EEND(__aeabi_uidivmod)
 #endif
 END(__udivsi3)
 
+ENTRY_NP(__divsi3)
 #ifdef __ARM_EABI__
-ENTRY_NP(__aeabi_idiv)
-ENTRY_NP(__aeabi_idivmod)
+EENTRY_NP(__aeabi_idiv)
+EENTRY_NP(__aeabi_idivmod)
 #endif
-ENTRY_NP(__divsi3)
 .L_divide:				/* r0 = r0 / r1; r1 = r0 % r1 */
 	eor     r0, r1, r0 		/* three EORs exchange r0 and r1 */
 	eor     r1, r0, r1 
 	eor     r0, r1, r0 
 					/* r0 = r1 / r0; r1 = r1 % r0 */
 	cmp	r0, #1			/* r0 is now the divisor */
 	bcc	.L_overflow		/* divisor == 0 */
 	beq	.L_divide_l0		/* divisor == 1 */
 	ands	ip, r0, #0x80000000	/* ip = sign bit of divisor */
 	rsbmi	r0, r0, #0		/* r0 = |divisor| */
 	ands	r2, r1, #0x80000000	/* r2 = sign bit of dividend */
 	eor	ip, ip, r2		/* ip bit 31 = signs differ */
 	rsbmi	r1, r1, #0		/* r1 = |dividend| (flags from ands r2) */
 	orr	ip, r2, ip, lsr #1	/* ip bit 0x40000000 = -ve division */
 					/* ip bit 0x80000000 = -ve remainder */
 
 .L_divide_l1:				/* common core; also entered from .L_udivide */
 	mov	r2, #1			/* r2 = 1, shifted to form quotient bits */
 	mov	r3, #0			/* r3 = quotient accumulator */
 
 	/*
 	 * If the highest bit of the dividend is set, we have to be
 	 * careful when shifting the divisor. Test this. 
 	 */
 	movs	r1,r1
 	bpl	.L_old_code
 
 	/*
 	 * At this point, the highest bit of r1 is known to be set.
 	 * We abuse this below in the tst instructions.
 	 */
 	tst	r1, r0 /*, lsl #0 */
 	bmi	.L_divide_b1
 	tst	r1, r0, lsl #1
 	bmi	.L_divide_b2
 	tst	r1, r0, lsl #2
 	bmi	.L_divide_b3
 	tst	r1, r0, lsl #3
 	bmi	.L_divide_b4
 	tst	r1, r0, lsl #4
 	bmi	.L_divide_b5
 	tst	r1, r0, lsl #5
 	bmi	.L_divide_b6
 	tst	r1, r0, lsl #6
 	bmi	.L_divide_b7
 	tst	r1, r0, lsl #7
 	bmi	.L_divide_b8
 	tst	r1, r0, lsl #8
 	bmi	.L_divide_b9
 	tst	r1, r0, lsl #9
 	bmi	.L_divide_b10
 	tst	r1, r0, lsl #10
 	bmi	.L_divide_b11
 	tst	r1, r0, lsl #11
 	bmi	.L_divide_b12
 	tst	r1, r0, lsl #12
 	bmi	.L_divide_b13
 	tst	r1, r0, lsl #13
 	bmi	.L_divide_b14
 	tst	r1, r0, lsl #14
 	bmi	.L_divide_b15
 	tst	r1, r0, lsl #15
 	bmi	.L_divide_b16
 	tst	r1, r0, lsl #16
 	bmi	.L_divide_b17
 	tst	r1, r0, lsl #17
 	bmi	.L_divide_b18
 	tst	r1, r0, lsl #18
 	bmi	.L_divide_b19
 	tst	r1, r0, lsl #19
 	bmi	.L_divide_b20
 	tst	r1, r0, lsl #20
 	bmi	.L_divide_b21
 	tst	r1, r0, lsl #21
 	bmi	.L_divide_b22
 	tst	r1, r0, lsl #22
 	bmi	.L_divide_b23
 	tst	r1, r0, lsl #23
 	bmi	.L_divide_b24
 	tst	r1, r0, lsl #24
 	bmi	.L_divide_b25
 	tst	r1, r0, lsl #25
 	bmi	.L_divide_b26
 	tst	r1, r0, lsl #26
 	bmi	.L_divide_b27
 	tst	r1, r0, lsl #27
 	bmi	.L_divide_b28
 	tst	r1, r0, lsl #28
 	bmi	.L_divide_b29
 	tst	r1, r0, lsl #29
 	bmi	.L_divide_b30
 	tst	r1, r0, lsl #30
 	bmi	.L_divide_b31
 /*
  * instead of:
  *	tst	r1, r0, lsl #31
  *	bmi	.L_divide_b32
  */
 	b	.L_divide_b32
 
 .L_old_code:				/* top bit of r1 clear: unsigned compares are safe */
 	cmp	r1, r0
 	bcc	.L_divide_b0
 	cmp	r1, r0, lsl #1
 	bcc	.L_divide_b1
 	cmp	r1, r0, lsl #2
 	bcc	.L_divide_b2
 	cmp	r1, r0, lsl #3
 	bcc	.L_divide_b3
 	cmp	r1, r0, lsl #4
 	bcc	.L_divide_b4
 	cmp	r1, r0, lsl #5
 	bcc	.L_divide_b5
 	cmp	r1, r0, lsl #6
 	bcc	.L_divide_b6
 	cmp	r1, r0, lsl #7
 	bcc	.L_divide_b7
 	cmp	r1, r0, lsl #8
 	bcc	.L_divide_b8
 	cmp	r1, r0, lsl #9
 	bcc	.L_divide_b9
 	cmp	r1, r0, lsl #10
 	bcc	.L_divide_b10
 	cmp	r1, r0, lsl #11
 	bcc	.L_divide_b11
 	cmp	r1, r0, lsl #12
 	bcc	.L_divide_b12
 	cmp	r1, r0, lsl #13
 	bcc	.L_divide_b13
 	cmp	r1, r0, lsl #14
 	bcc	.L_divide_b14
 	cmp	r1, r0, lsl #15
 	bcc	.L_divide_b15
 	cmp	r1, r0, lsl #16
 	bcc	.L_divide_b16
 	cmp	r1, r0, lsl #17
 	bcc	.L_divide_b17
 	cmp	r1, r0, lsl #18
 	bcc	.L_divide_b18
 	cmp	r1, r0, lsl #19
 	bcc	.L_divide_b19
 	cmp	r1, r0, lsl #20
 	bcc	.L_divide_b20
 	cmp	r1, r0, lsl #21
 	bcc	.L_divide_b21
 	cmp	r1, r0, lsl #22
 	bcc	.L_divide_b22
 	cmp	r1, r0, lsl #23
 	bcc	.L_divide_b23
 	cmp	r1, r0, lsl #24
 	bcc	.L_divide_b24
 	cmp	r1, r0, lsl #25
 	bcc	.L_divide_b25
 	cmp	r1, r0, lsl #26
 	bcc	.L_divide_b26
 	cmp	r1, r0, lsl #27
 	bcc	.L_divide_b27
 	cmp	r1, r0, lsl #28
 	bcc	.L_divide_b28
 	cmp	r1, r0, lsl #29
 	bcc	.L_divide_b29
 	cmp	r1, r0, lsl #30
 	bcc	.L_divide_b30
 .L_divide_b32:				/* unrolled shift-and-subtract steps follow */
 	cmp	r1, r0, lsl #31
 	subhs	r1, r1,r0, lsl #31
 	addhs	r3, r3,r2, lsl #31
 .L_divide_b31:
 	cmp	r1, r0, lsl #30
 	subhs	r1, r1,r0, lsl #30
 	addhs	r3, r3,r2, lsl #30
 .L_divide_b30:
 	cmp	r1, r0, lsl #29
 	subhs	r1, r1,r0, lsl #29
 	addhs	r3, r3,r2, lsl #29
 .L_divide_b29:
 	cmp	r1, r0, lsl #28
 	subhs	r1, r1,r0, lsl #28
 	addhs	r3, r3,r2, lsl #28
 .L_divide_b28:
 	cmp	r1, r0, lsl #27
 	subhs	r1, r1,r0, lsl #27
 	addhs	r3, r3,r2, lsl #27
 .L_divide_b27:
 	cmp	r1, r0, lsl #26
 	subhs	r1, r1,r0, lsl #26
 	addhs	r3, r3,r2, lsl #26
 .L_divide_b26:
 	cmp	r1, r0, lsl #25
 	subhs	r1, r1,r0, lsl #25
 	addhs	r3, r3,r2, lsl #25
 .L_divide_b25:
 	cmp	r1, r0, lsl #24
 	subhs	r1, r1,r0, lsl #24
 	addhs	r3, r3,r2, lsl #24
 .L_divide_b24:
 	cmp	r1, r0, lsl #23
 	subhs	r1, r1,r0, lsl #23
 	addhs	r3, r3,r2, lsl #23
 .L_divide_b23:
 	cmp	r1, r0, lsl #22
 	subhs	r1, r1,r0, lsl #22
 	addhs	r3, r3,r2, lsl #22
 .L_divide_b22:
 	cmp	r1, r0, lsl #21
 	subhs	r1, r1,r0, lsl #21
 	addhs	r3, r3,r2, lsl #21
 .L_divide_b21:
 	cmp	r1, r0, lsl #20
 	subhs	r1, r1,r0, lsl #20
 	addhs	r3, r3,r2, lsl #20
 .L_divide_b20:
 	cmp	r1, r0, lsl #19
 	subhs	r1, r1,r0, lsl #19
 	addhs	r3, r3,r2, lsl #19
 .L_divide_b19:
 	cmp	r1, r0, lsl #18
 	subhs	r1, r1,r0, lsl #18
 	addhs	r3, r3,r2, lsl #18
 .L_divide_b18:
 	cmp	r1, r0, lsl #17
 	subhs	r1, r1,r0, lsl #17
 	addhs	r3, r3,r2, lsl #17
 .L_divide_b17:
 	cmp	r1, r0, lsl #16
 	subhs	r1, r1,r0, lsl #16
 	addhs	r3, r3,r2, lsl #16
 .L_divide_b16:
 	cmp	r1, r0, lsl #15
 	subhs	r1, r1,r0, lsl #15
 	addhs	r3, r3,r2, lsl #15
 .L_divide_b15:
 	cmp	r1, r0, lsl #14
 	subhs	r1, r1,r0, lsl #14
 	addhs	r3, r3,r2, lsl #14
 .L_divide_b14:
 	cmp	r1, r0, lsl #13
 	subhs	r1, r1,r0, lsl #13
 	addhs	r3, r3,r2, lsl #13
 .L_divide_b13:
 	cmp	r1, r0, lsl #12
 	subhs	r1, r1,r0, lsl #12
 	addhs	r3, r3,r2, lsl #12
 .L_divide_b12:
 	cmp	r1, r0, lsl #11
 	subhs	r1, r1,r0, lsl #11
 	addhs	r3, r3,r2, lsl #11
 .L_divide_b11:
 	cmp	r1, r0, lsl #10
 	subhs	r1, r1,r0, lsl #10
 	addhs	r3, r3,r2, lsl #10
 .L_divide_b10:
 	cmp	r1, r0, lsl #9
 	subhs	r1, r1,r0, lsl #9
 	addhs	r3, r3,r2, lsl #9
 .L_divide_b9:
 	cmp	r1, r0, lsl #8
 	subhs	r1, r1,r0, lsl #8
 	addhs	r3, r3,r2, lsl #8
 .L_divide_b8:
 	cmp	r1, r0, lsl #7
 	subhs	r1, r1,r0, lsl #7
 	addhs	r3, r3,r2, lsl #7
 .L_divide_b7:
 	cmp	r1, r0, lsl #6
 	subhs	r1, r1,r0, lsl #6
 	addhs	r3, r3,r2, lsl #6
 .L_divide_b6:
 	cmp	r1, r0, lsl #5
 	subhs	r1, r1,r0, lsl #5
 	addhs	r3, r3,r2, lsl #5
 .L_divide_b5:
 	cmp	r1, r0, lsl #4
 	subhs	r1, r1,r0, lsl #4
 	addhs	r3, r3,r2, lsl #4
 .L_divide_b4:
 	cmp	r1, r0, lsl #3
 	subhs	r1, r1,r0, lsl #3
 	addhs	r3, r3,r2, lsl #3
 .L_divide_b3:
 	cmp	r1, r0, lsl #2
 	subhs	r1, r1,r0, lsl #2
 	addhs	r3, r3,r2, lsl #2
 .L_divide_b2:
 	cmp	r1, r0, lsl #1
 	subhs	r1, r1,r0, lsl #1
 	addhs	r3, r3,r2, lsl #1
 .L_divide_b1:
 	cmp	r1, r0
 	subhs	r1, r1, r0
 	addhs	r3, r3, r2
 .L_divide_b0:
 
 	tst	ip, #0x20000000		/* bit is set only on the .L_udivide path */
 	bne	.L_udivide_l1
 	mov	r0, r3			/* r0 = quotient, r1 = remainder */
 	cmp	ip, #0			/* N = ip bit 0x80000000 */
 	rsbmi	r1, r1, #0		/* negate remainder */
 	movs	ip, ip, lsl #1		/* N = former ip bit 0x40000000 */
 	bicmi	r0, r0, #0x80000000	/* Fix in case we divided 0x80000000 */
 	rsbmi	r0, r0, #0		/* negate quotient */
 	RET
 
 .L_udivide_l1:				/* dividend was halved in .L_udivide */
 	tst	ip, #0x10000000		/* Z clear if the dropped bit 0 was 1 */
 	mov	r1, r1, lsl #1		/* remainder <<= 1 ... */
 	orrne	r1, r1, #1		/* ... and put the dropped bit back */
 	mov	r3, r3, lsl #1		/* quotient <<= 1 */
 	cmp	r1, r0			/* one final subtract step */
 	subhs	r1, r1, r0
 	addhs	r3, r3, r2
 	mov	r0, r3			/* r0 = quotient, r1 = remainder */
 	RET
 #ifdef __ARM_EABI__
-END(__aeabi_idiv)
-END(__aeabi_idivmod)
+EEND(__aeabi_idiv)
+EEND(__aeabi_idivmod)
 #endif
 END(__divsi3)
 
Index: stable/10
===================================================================
--- stable/10	(revision 269795)
+++ stable/10	(revision 269796)

Property changes on: stable/10
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r269390