Index: user/attilio/vmc-playground/sys/arm/arm/pmap-v6.c
===================================================================
--- user/attilio/vmc-playground/sys/arm/arm/pmap-v6.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/arm/arm/pmap-v6.c	(revision 247224)
@@ -1,3865 +1,3864 @@
 /* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */
 /*-
  * Copyright 2011 Semihalf
  * Copyright 2004 Olivier Houchard.
  * Copyright 2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * From: FreeBSD: src/sys/arm/arm/pmap.c,v 1.113 2009/07/24 13:50:29
  */
 
 /*-
  * Copyright (c) 2002-2003 Wasabi Systems, Inc.
  * Copyright (c) 2001 Richard Earnshaw
  * Copyright (c) 2001-2002 Christopher Gilbert
  * All rights reserved.
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*-
  * Copyright (c) 1999 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Charles M. Hannum.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Mark Brinicombe.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  *
  * RiscBSD kernel project
  *
  * pmap.c
  *
  * Machine dependent vm stuff
  *
  * Created      : 20/09/94
  */
 
 /*
  * Special compilation symbols
  * PMAP_DEBUG           - Build in pmap_debug_level code
  */
 /* Include header files */
 
 #include "opt_vm.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/uma.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 
 #include <machine/md_var.h>
 #include <machine/cpu.h>
 #include <machine/cpufunc.h>
 #include <machine/pcb.h>
 
 #ifdef DEBUG
 extern int last_fault_code;
 #endif
 
 /*
  * Debug helpers.
  *
  * With PMAP_DEBUG defined, PDEBUG(level, stmt) evaluates stmt when
  * pmap_debug_level is at least `level', dprintf is an alias for printf and
  * PMAP_INLINE expands to nothing.  Without PMAP_DEBUG, PDEBUG() and
  * dprintf() expand to nothing and PMAP_INLINE becomes __inline.
  *
  * NOTE(review): the debug PDEBUG() expands to a bare `if' statement, so it
  * can capture a following `else' when used unbraced inside another if/else.
  */
 #ifdef PMAP_DEBUG
 #define PDEBUG(_lev_,_stat_) \
         if (pmap_debug_level >= (_lev_)) \
                 ((_stat_))
 #define dprintf printf
 
 /* Current verbosity threshold compared against by PDEBUG(). */
 int pmap_debug_level = 0;
 #define PMAP_INLINE
 #else   /* PMAP_DEBUG */
 #define PDEBUG(_lev_,_stat_) /* Nothing */
 #define dprintf(x, arg...)
 #define PMAP_INLINE __inline
 #endif  /* PMAP_DEBUG */
 
 /*
  * L2 cache maintenance wrappers.  Callers always supply both the virtual
  * and the physical address of the range; when ARM_L2_PIPT is defined the
  * physical address is forwarded to the cpu_l2cache_* routine, otherwise
  * the virtual address is.
  */
 #ifdef ARM_L2_PIPT
 #define pmap_l2cache_wbinv_range(va, pa, size) cpu_l2cache_wbinv_range((pa), (size))
 #define pmap_l2cache_inv_range(va, pa, size) cpu_l2cache_inv_range((pa), (size))
 #else
 #define pmap_l2cache_wbinv_range(va, pa, size) cpu_l2cache_wbinv_range((va), (size))
 #define pmap_l2cache_inv_range(va, pa, size) cpu_l2cache_inv_range((va), (size))
 #endif
 
 extern struct pv_addr systempage;
 
 /*
  * Internal function prototypes
  */
 static void pmap_free_pv_entry (pv_entry_t);
 static pv_entry_t pmap_get_pv_entry(void);
 
 static void		pmap_enter_locked(pmap_t, vm_offset_t, vm_page_t,
     vm_prot_t, boolean_t, int);
 static vm_paddr_t	pmap_extract_locked(pmap_t pmap, vm_offset_t va);
 static void		pmap_alloc_l1(pmap_t);
 static void		pmap_free_l1(pmap_t);
 
 static int		pmap_clearbit(struct vm_page *, u_int);
 
 static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t);
 static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t);
 static void		pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int);
 static vm_offset_t	kernel_pt_lookup(vm_paddr_t);
 
 static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1");
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 vm_offset_t pmap_curmaxkvaddr;
 vm_paddr_t kernel_l1pa;
 
 vm_offset_t kernel_vm_end = 0;
 
 struct pmap kernel_pmap_store;
 
 static pt_entry_t *csrc_pte, *cdst_pte;
 static vm_offset_t csrcp, cdstp;
 static struct mtx cmtx;
 
 static void		pmap_init_l1(struct l1_ttable *, pd_entry_t *);
 /*
  * These routines are called when the CPU type is identified to set up
  * the PTE prototypes, cache modes, etc.
  *
  * The variables are always here, just in case LKMs need to reference
  * them (though, they shouldn't).
  */
 static void pmap_set_prot(pt_entry_t *pte, vm_prot_t prot, uint8_t user);
 pt_entry_t	pte_l1_s_cache_mode;
 pt_entry_t	pte_l1_s_cache_mode_pt;
 
 pt_entry_t	pte_l2_l_cache_mode;
 pt_entry_t	pte_l2_l_cache_mode_pt;
 
 pt_entry_t	pte_l2_s_cache_mode;
 pt_entry_t	pte_l2_s_cache_mode_pt;
 
 struct msgbuf *msgbufp = 0;
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 extern void bcopy_page(vm_offset_t, vm_offset_t);
 extern void bzero_page(vm_offset_t);
 
 char *_tmppt;
 
 /*
  * Metadata for L1 translation tables.
  *
  * One of these describes each L1 table.  pmap_init_l1() fills it in and
  * links it on both l1_list and l1_lru_list; pmap_alloc_l1() and
  * pmap_free_l1() hand out and take back domain numbers from it.
  */
 struct l1_ttable {
 	/* Entry on the L1 Table list */
 	SLIST_ENTRY(l1_ttable) l1_link;
 
 	/* Entry on the L1 Least Recently Used list */
 	TAILQ_ENTRY(l1_ttable) l1_lru;
 
 	/* Track how many domains are allocated from this L1 */
 	volatile u_int l1_domain_use_count;
 
 	/*
 	 * A free-list of domain numbers for this L1.
 	 * We avoid using ffs() and a bitmap to track domains since ffs()
 	 * is slow on ARM.
 	 *
 	 * l1_domain_first is the index of the first free domain and
 	 * l1_domain_free[i] holds the index of the free domain that follows
 	 * i (initialised to i + 1 by pmap_init_l1()).
 	 */
 	u_int8_t l1_domain_first;
 	u_int8_t l1_domain_free[PMAP_DOMAINS];
 
 	/* Physical address of this L1 page table */
 	vm_paddr_t l1_physaddr;
 
 	/* KVA of this L1 page table */
 	pd_entry_t *l1_kva;
 };
 
 /*
  * Convert a virtual address into its L1 table index. That is, the
  * index used to locate the L2 descriptor table pointer in an L1 table.
  * This is basically used to index l1->l1_kva[].
  *
  * Each L2 descriptor table represents 1MB of VA space.
  */
 #define	L1_IDX(va)		(((vm_offset_t)(va)) >> L1_S_SHIFT)
 
 /*
  * L1 Page Tables are tracked using a Least Recently Used list.
  *  - New L1s are allocated from the HEAD.
  *  - Freed L1s are added to the TAIL.
  *  - Recently accessed L1s (where an 'access' is some change to one of
  *    the userland pmaps which owns this L1) are moved to the TAIL.
  */
 static TAILQ_HEAD(, l1_ttable) l1_lru_list;
 /*
  * A list of all L1 tables
  */
 static SLIST_HEAD(, l1_ttable) l1_list;
 static struct mtx l1_lru_lock;
 
 /*
  * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots.
  *
  * This is normally 16MB worth of L2 page descriptors for any given pmap.
  * Reference counts are maintained for L2 descriptors so they can be
  * freed when empty: pmap_alloc_l2_bucket() bumps l2_occupancy when it
  * attaches an L2 table to a bucket, and pmap_free_l2_bucket() drops it
  * and releases the l2_dtable once it reaches zero.
  */
 struct l2_dtable {
 	/* The number of L2 page descriptors allocated to this l2_dtable */
 	u_int l2_occupancy;
 
 	/* List of L2 page descriptors */
 	struct l2_bucket {
 		pt_entry_t *l2b_kva;	/* KVA of L2 Descriptor Table */
 		vm_paddr_t l2b_phys;	/* Physical address of same */
 		u_short l2b_l1idx;	/* This L2 table's L1 index */
 		u_short l2b_occupancy;	/* How many active descriptors */
 	} l2_bucket[L2_BUCKET_SIZE];
 };
 
 /* pmap_kenter_internal flags */
 #define KENTER_CACHE	0x1
 #define KENTER_USER	0x2
 
 /*
  * Given an L1 table index, calculate the corresponding l2_dtable index
  * and bucket index within the l2_dtable.
  */
 #define	L2_IDX(l1idx)		(((l1idx) >> L2_BUCKET_LOG2) & \
 				 (L2_SIZE - 1))
 #define	L2_BUCKET(l1idx)	((l1idx) & (L2_BUCKET_SIZE - 1))
 
 /*
  * Given a virtual address, this macro returns the
  * virtual address required to drop into the next L2 bucket.
  */
 #define	L2_NEXT_BUCKET(va)	(((va) & L1_S_FRAME) + L1_S_SIZE)
 
 /*
  * We try to map the page tables write-through, if possible.  However, not
  * all CPUs have a write-through cache mode, so on those we have to sync
  * the cache when we frob page tables.
  *
  * We try to evaluate this at compile time, if possible.  However, it's
  * not always possible to do that, hence this run-time var.
  */
 int	pmap_needs_pte_sync;
 
 /*
  * Macro to determine if a mapping might be resident in the
  * instruction cache and/or TLB
  */
 #define	PV_BEEN_EXECD(f)  (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC))
 
 /*
  * Macro to determine if a mapping might be resident in the
  * data cache and/or TLB
  */
 #define	PV_BEEN_REFD(f)   (((f) & PVF_REF) != 0)
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 #define pmap_is_current(pm)	((pm) == pmap_kernel() || \
             curproc->p_vmspace->vm_map.pmap == (pm))
 static uma_zone_t pvzone = NULL;
 uma_zone_t l2zone;
 static uma_zone_t l2table_zone;
 static vm_offset_t pmap_kernel_l2dtable_kva;
 static vm_offset_t pmap_kernel_l2ptp_kva;
 static vm_paddr_t pmap_kernel_l2ptp_phys;
-static struct vm_object pvzone_obj;
 static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
 static struct rwlock pvh_global_lock;
 
 /*
  * Memory-type attribute tables, one per descriptor format.  All three
  * list the same memory types in the same order, so a single index selects
  * the matching encoding in each; pmap_pte_init_mmu_v6() indexes them with
  * PTE_CACHE and PTE_PAGETABLE.
  */
 
 /* Encodings for L1 section descriptors. */
 int l1_mem_types[] = {
 	ARM_L1S_STRONG_ORD,
 	ARM_L1S_DEVICE_NOSHARE,
 	ARM_L1S_DEVICE_SHARE,
 	ARM_L1S_NRML_NOCACHE,
 	ARM_L1S_NRML_IWT_OWT,
 	ARM_L1S_NRML_IWB_OWB,
 	ARM_L1S_NRML_IWBA_OWBA
 };
 
 /* Encodings for L2 large-page descriptors. */
 int l2l_mem_types[] = {
 	ARM_L2L_STRONG_ORD,
 	ARM_L2L_DEVICE_NOSHARE,
 	ARM_L2L_DEVICE_SHARE,
 	ARM_L2L_NRML_NOCACHE,
 	ARM_L2L_NRML_IWT_OWT,
 	ARM_L2L_NRML_IWB_OWB,
 	ARM_L2L_NRML_IWBA_OWBA
 };
 
 /* Encodings for L2 small-page descriptors. */
 int l2s_mem_types[] = {
 	ARM_L2S_STRONG_ORD,
 	ARM_L2S_DEVICE_NOSHARE,
 	ARM_L2S_DEVICE_SHARE,
 	ARM_L2S_NRML_NOCACHE,
 	ARM_L2S_NRML_IWT_OWT,
 	ARM_L2S_NRML_IWB_OWB,
 	ARM_L2S_NRML_IWBA_OWBA
 };
 
 /*
  * This list exists for the benefit of pmap_map_chunk().  It keeps track
  * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can
  * find them as necessary.
  *
  * Note that the data on this list MUST remain valid after initarm() returns,
  * as pmap_bootstrap() uses it to construct L2 table metadata.
  */
 SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list);
 
 /*
  * Fill in the metadata for an L1 translation table and register it on
  * the global L1 lists.
  *
  *	l1	metadata structure to initialise
  *	l1pt	KVA of the L1 table it describes
  *
  * Panics when the physical address of the table cannot be obtained from
  * the kernel pmap.
  */
 static void
 pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt)
 {
 	pd_entry_t *kernel_l1pt;
 	int dom;
 
 	l1->l1_kva = l1pt;
 	l1->l1_domain_use_count = 0;
 	l1->l1_domain_first = 0;
 
 	/* Chain each domain slot to its successor to build the free list. */
 	dom = 0;
 	while (dom < PMAP_DOMAINS) {
 		l1->l1_domain_free[dom] = dom + 1;
 		dom++;
 	}
 
 	/*
 	 * Any L1 other than the kernel's own starts out as a copy of the
 	 * kernel's L1 entries.
 	 */
 	kernel_l1pt = pmap_kernel()->pm_l1->l1_kva;
 	if (l1pt != kernel_l1pt)
 		memcpy(l1pt, kernel_l1pt, L1_TABLE_SIZE);
 
 	l1->l1_physaddr = pmap_extract(pmap_kernel(), (vm_offset_t)l1pt);
 	if (l1->l1_physaddr == 0)
 		panic("pmap_init_l1: can't get PA of L1 at %p", l1pt);
 
 	SLIST_INSERT_HEAD(&l1_list, l1, l1_link);
 	TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 }
 
 /*
  * Search kernel_pt_list for the entry whose physical address equals `pa'
  * and return the virtual address recorded for it.
  *
  * Returns 0 when no entry on the list matches.
  */
 static vm_offset_t
 kernel_pt_lookup(vm_paddr_t pa)
 {
 	struct pv_addr *entry;
 	vm_offset_t va;
 
 	va = 0;
 	SLIST_FOREACH(entry, &kernel_pt_list, pv_list) {
 		if (entry->pv_pa != pa)
 			continue;
 		va = entry->pv_va;
 		break;
 	}
 	return (va);
 }
 
 /*
  * Select the cache-mode bits used when building descriptors: one set for
  * ordinary mappings (indexed by PTE_CACHE) and one for mappings of page
  * tables (indexed by PTE_PAGETABLE).  Also sets pmap_needs_pte_sync when
  * PTE_PAGETABLE is 3 or greater.
  */
 void
 pmap_pte_init_mmu_v6(void)
 {
 
 	/* Cache modes for ordinary mappings. */
 	pte_l2_s_cache_mode = l2s_mem_types[PTE_CACHE];
 	pte_l2_l_cache_mode = l2l_mem_types[PTE_CACHE];
 	pte_l1_s_cache_mode = l1_mem_types[PTE_CACHE];
 
 	/* Cache modes for mappings of the page tables themselves. */
 	pte_l2_s_cache_mode_pt = l2s_mem_types[PTE_PAGETABLE];
 	pte_l2_l_cache_mode_pt = l2l_mem_types[PTE_PAGETABLE];
 	pte_l1_s_cache_mode_pt = l1_mem_types[PTE_PAGETABLE];
 
 	if (!(PTE_PAGETABLE < 3))
 		pmap_needs_pte_sync = 1;
 }
 
 /*
  * Allocate an L1 translation table for the specified pmap.
  * This is called at pmap creation time.
  *
  * Takes the L1 at the head of the LRU list, pops the first free domain
  * number from it and records both in the pmap.  The whole list and
  * free-list manipulation is done under l1_lru_lock.
  *
  * NOTE(review): the result of TAILQ_FIRST() is used without a NULL check;
  * presumably the LRU list can never be empty here -- confirm.
  */
 static void
 pmap_alloc_l1(pmap_t pm)
 {
 	struct l1_ttable *l1;
 	u_int8_t domain;
 
 	/*
 	 * Remove the L1 at the head of the LRU list
 	 */
 	mtx_lock(&l1_lru_lock);
 	l1 = TAILQ_FIRST(&l1_lru_list);
 	TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
 
 	/*
 	 * Pick the first available domain number, and update
 	 * the link to the next number.
 	 */
 	domain = l1->l1_domain_first;
 	l1->l1_domain_first = l1->l1_domain_free[domain];
 
 	/*
 	 * If there are still free domain numbers in this L1,
 	 * put it back on the TAIL of the LRU list.  An L1 whose domains
 	 * are all in use stays off the list until pmap_free_l1() returns
 	 * one of them.
 	 */
 	if (++l1->l1_domain_use_count < PMAP_DOMAINS)
 		TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 
 	mtx_unlock(&l1_lru_lock);
 
 	/*
 	 * Fix up the relevant bits in the pmap structure.  The domain is
 	 * stored one-based; pmap_free_l1() subtracts one again.
 	 */
 	pm->pm_l1 = l1;
 	pm->pm_domain = domain + 1;
 }
 
 /*
  * Give back the L1 translation table reference and the domain number
  * held by a pmap.  Called at pmap destruction time.
  *
  * The domain number is pushed back on the L1's free list and the L1 is
  * re-queued on the LRU list, all under l1_lru_lock.
  */
 static void
 pmap_free_l1(pmap_t pm)
 {
 	struct l1_ttable *l1;
 
 	l1 = pm->pm_l1;
 
 	mtx_lock(&l1_lru_lock);
 
 	/*
 	 * The L1 is only linked on the LRU list while it still has a free
 	 * domain; unlink it in that case so it can be re-queued below.
 	 */
 	if (l1->l1_domain_use_count < PMAP_DOMAINS)
 		TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
 
 	/*
 	 * Push the pmap's domain number (kept one-based in pm_domain) onto
 	 * the front of the free list and drop the use count.
 	 */
 	l1->l1_domain_free[pm->pm_domain - 1] = l1->l1_domain_first;
 	l1->l1_domain_first = pm->pm_domain - 1;
 	l1->l1_domain_use_count--;
 
 	/*
 	 * At least one domain is free now, so the L1 goes back on the LRU
 	 * list: at the tail while other pmaps still use it, at the head
 	 * once it is completely unused.
 	 */
 	if (l1->l1_domain_use_count != 0)
 		TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 	else
 		TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru);
 
 	mtx_unlock(&l1_lru_lock);
 }
 
 /*
  * Look up the L2 bucket that covers `va' in pmap `pm'.
  *
  * Returns NULL when the pmap has no l2_dtable for the address or when the
  * bucket has no L2 table attached (l2b_kva is NULL); otherwise returns a
  * pointer to the bucket.  Nothing is allocated.
  */
 static PMAP_INLINE struct l2_bucket *
 pmap_get_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *dtable;
 	struct l2_bucket *bucket;
 	u_short l1idx;
 
 	l1idx = L1_IDX(va);
 
 	dtable = pm->pm_l2[L2_IDX(l1idx)];
 	if (dtable == NULL)
 		return (NULL);
 
 	bucket = &dtable->l2_bucket[L2_BUCKET(l1idx)];
 	if (bucket->l2b_kva == NULL)
 		return (NULL);
 
 	return (bucket);
 }
 
 /*
  * Returns a pointer to the L2 bucket associated with the specified pmap
  * and VA.
  *
  * If no L2 bucket exists, perform the necessary allocations to put an L2
  * bucket/page table in place.  Returns NULL if either allocation fails.
  *
  * The caller must hold the pmap lock and pvh_global_lock (write).  Both
  * are dropped around each uma_zalloc() call and re-taken afterwards, so
  * pmap state may change across this call; each allocation is followed by
  * a re-check for a racing allocation by another thread.
  *
  * Note that if a new L2 bucket/page was allocated, the caller *must*
  * increment the bucket occupancy counter appropriately *before*
  * releasing the pmap's lock to ensure no other thread or cpu deallocates
  * the bucket/page in the meantime.
  */
 static struct l2_bucket *
 pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	u_short l1idx;
 
 	l1idx = L1_IDX(va);
 
 	PMAP_ASSERT_LOCKED(pm);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
 		/*
 		 * No mapping at this address, as there is
 		 * no entry in the L1 table.
 		 * Need to allocate a new l2_dtable.
 		 */
 		PMAP_UNLOCK(pm);
 		rw_wunlock(&pvh_global_lock);
 		if ((l2 = uma_zalloc(l2table_zone, M_NOWAIT)) == NULL) {
 			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pm);
 			return (NULL);
 		}
 		rw_wlock(&pvh_global_lock);
 		PMAP_LOCK(pm);
 		if (pm->pm_l2[L2_IDX(l1idx)] != NULL) {
 			/*
 			 * Someone already allocated the l2_dtable while
 			 * we were doing the same.
 			 */
 			uma_zfree(l2table_zone, l2);
 			l2 = pm->pm_l2[L2_IDX(l1idx)];
 		} else {
 			bzero(l2, sizeof(*l2));
 			/*
 			 * Link it into the parent pmap
 			 */
 			pm->pm_l2[L2_IDX(l1idx)] = l2;
 		}
 	}
 
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 
 	/*
 	 * Fetch pointer to the L2 page table associated with the address.
 	 */
 	if (l2b->l2b_kva == NULL) {
 		pt_entry_t *ptep;
 
 		/*
 		 * No L2 page table has been allocated. Chances are, this
 		 * is because we just allocated the l2_dtable, above.
 		 */
 		PMAP_UNLOCK(pm);
 		rw_wunlock(&pvh_global_lock);
 		ptep = uma_zalloc(l2zone, M_NOWAIT);
 		rw_wlock(&pvh_global_lock);
 		PMAP_LOCK(pm);
 		if (l2b->l2b_kva != NULL) {
 			/*
 			 * We lost the race.  Our own allocation may have
 			 * failed, so only hand back a table we really got
 			 * rather than rely on uma_zfree() accepting NULL.
 			 */
 			if (ptep != NULL)
 				uma_zfree(l2zone, ptep);
 			return (l2b);
 		}
 		if (ptep == NULL) {
 			/*
 			 * Oops, no more L2 page tables available at this
 			 * time. We may need to deallocate the l2_dtable
 			 * if we allocated a new one above.
 			 */
 			if (l2->l2_occupancy == 0) {
 				pm->pm_l2[L2_IDX(l1idx)] = NULL;
 				uma_zfree(l2table_zone, l2);
 			}
 			return (NULL);
 		}
 
 		/*
 		 * Only record the physical address once the allocation is
 		 * known to have succeeded, so a failed allocation never
 		 * leaves a bogus l2b_phys in an unallocated bucket.
 		 */
 		l2b->l2b_phys = vtophys(ptep);
 		l2->l2_occupancy++;
 		l2b->l2b_kva = ptep;
 		l2b->l2b_l1idx = l1idx;
 	}
 
 	return (l2b);
 }
 
 /*
  * Return an L2 page table to l2zone, the zone pmap_alloc_l2_bucket()
  * allocates them from.
  */
 static PMAP_INLINE void
 pmap_free_l2_ptp(pt_entry_t *l2)
 {
 	uma_zfree(l2zone, l2);
 }
 /*
  * One or more mappings in the specified L2 descriptor table have just been
  * invalidated.
  *
  * Garbage collect the metadata and descriptor table itself if necessary.
  *
  *	pm	pmap that owns the bucket
  *	l2b	bucket whose PTEs were invalidated
  *	count	number of PTEs the caller invalidated in that bucket
  *
  * The pmap lock must be acquired when this is called (not necessary
  * for the kernel pmap).
  */
 static void
 pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count)
 {
 	struct l2_dtable *l2;
 	pd_entry_t *pl1pd, l1pd;
 	pt_entry_t *ptep;
 	u_short l1idx;
 
 
 	/*
 	 * Update the bucket's reference count according to how many
 	 * PTEs the caller has just invalidated.
 	 */
 	l2b->l2b_occupancy -= count;
 
 	/*
 	 * Note:
 	 *
 	 * Level 2 page tables allocated to the kernel pmap are never freed
 	 * as that would require checking all Level 1 page tables and
 	 * removing any references to the Level 2 page table. See also the
 	 * comment elsewhere about never freeing bootstrap L2 descriptors.
 	 *
 	 * We make do with just invalidating the mapping in the L2 table.
 	 *
 	 * This isn't really a big deal in practice and, in fact, leads
 	 * to a performance win over time as we don't need to continually
 	 * alloc/free.
 	 */
 	if (l2b->l2b_occupancy > 0 || pm == pmap_kernel())
 		return;
 
 	/*
 	 * There are no more valid mappings in this level 2 page table.
 	 * Go ahead and NULL-out the pointer in the bucket, then
 	 * free the page table.
 	 */
 	l1idx = l2b->l2b_l1idx;
 	ptep = l2b->l2b_kva;
 	l2b->l2b_kva = NULL;
 
 	pl1pd = &pm->pm_l1->l1_kva[l1idx];
 
 	/*
 	 * If the L1 slot matches the pmap's domain
 	 * number, then invalidate it.
 	 */
 	l1pd = *pl1pd & (L1_TYPE_MASK | L1_C_DOM_MASK);
 	if (l1pd == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) {
 		*pl1pd = 0;
 		PTE_SYNC(pl1pd);
 	}
 
 	/*
 	 * Release the L2 descriptor table back to the pool cache.
 	 */
 	pmap_free_l2_ptp(ptep);
 
 	/*
 	 * Update the reference count in the associated l2_dtable
 	 */
 	l2 = pm->pm_l2[L2_IDX(l1idx)];
 	if (--l2->l2_occupancy > 0)
 		return;
 
 	/*
 	 * There are no more valid mappings in any of the Level 1
 	 * slots managed by this l2_dtable. Go ahead and NULL-out
 	 * the pointer in the parent pmap and free the l2_dtable.
 	 */
 	pm->pm_l2[L2_IDX(l1idx)] = NULL;
 	uma_zfree(l2table_zone, l2);
 }
 
 /*
  * Pool cache constructors for L2 descriptor tables, metadata and pmap
  * structures.
  */
 
 /*
  * Constructor for items of the L2 page table zone (registered with
  * uma_zcreate() in pmap_init()).
  *
  *	mem	the item being constructed
  *	size, arg, flags
  *		not referenced by this constructor
  *
  * Writes back and invalidates the caches for the page containing `mem',
  * switches the kernel mapping of that page to the page-table cache mode
  * if it is not already using it, then zeroes L2_TABLE_SIZE_REAL bytes at
  * `mem'.  Always returns 0.
  *
  * NOTE(review): the bucket returned by pmap_get_l2_bucket() is used
  * without a NULL check; presumably `mem' is always mapped in the kernel
  * pmap -- confirm.
  */
 static int
 pmap_l2ptp_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	/* Start of the page that contains the item. */
 	vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK;
 
 	/*
 	 * The mappings for these page tables were initially made using
 	 * pmap_kenter() by the pool subsystem. Therefore, the cache-
 	 * mode will not be right for page table mappings. To avoid
 	 * polluting the pmap_kenter() code with a special case for
 	 * page tables, we simply fix up the cache-mode here if it's not
 	 * correct.
 	 */
 	l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 	pte = *ptep;
 
 	/* Flush the page from the caches before touching its mapping. */
 	cpu_idcache_wbinv_range(va, PAGE_SIZE);
 	pmap_l2cache_wbinv_range(va, pte & L2_S_FRAME, PAGE_SIZE);
 	if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
 		/*
 		 * Page tables must have the cache-mode set to
 		 * Write-Thru.
 		 */
 		*ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
 		PTE_SYNC(ptep);
 		cpu_tlb_flushD_SE(va);
 		cpu_cpwait();
 	}
 
 	memset(mem, 0, L2_TABLE_SIZE_REAL);
 	return (0);
 }
 
 /*
  * Modify pte bits for all ptes corresponding to the given physical address.
  * We use `maskbits' rather than `clearbits' because we're always passing
  * constants and the latter would require an extra inversion at run-time.
  *
  *	pg		page whose mappings are to be modified
  *	maskbits	PVF_* bits to clear; PVF_WRITE implies PVF_MOD
  *
  * Returns the number of PTEs that were actually changed (0 when the page
  * has no mappings).  Takes pvh_global_lock for writing for the duration
  * and each mapping's pmap lock while its PTE is updated.
  */
 static int
 pmap_clearbit(struct vm_page *pg, u_int maskbits)
 {
 	struct l2_bucket *l2b;
 	struct pv_entry *pv;
 	pt_entry_t *ptep, npte, opte;
 	pmap_t pm;
 	vm_offset_t va;
 	u_int oflags;
 	int count = 0;
 
 	rw_wlock(&pvh_global_lock);
 
 	if (maskbits & PVF_WRITE)
 		maskbits |= PVF_MOD;
 	/*
 	 * Clear saved attributes (modify, reference)
 	 */
 	pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF));
 
 	if (TAILQ_EMPTY(&pg->md.pv_list)) {
 		rw_wunlock(&pvh_global_lock);
 		return (0);
 	}
 
 	/*
 	 * Loop over all current mappings setting/clearing as appropriate
 	 */
 	TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 		va = pv->pv_va;
 		pm = pv->pv_pmap;
 		/* Remember the old flags; they decide the TLB flush below. */
 		oflags = pv->pv_flags;
 		pv->pv_flags &= ~maskbits;
 
 		PMAP_LOCK(pm);
 
 		/*
 		 * NOTE(review): the bucket is not NULL-checked; presumably
 		 * every pv entry has an L2 table behind it -- confirm.
 		 */
 		l2b = pmap_get_l2_bucket(pm, va);
 
 		ptep = &l2b->l2b_kva[l2pte_index(va)];
 		npte = opte = *ptep;
 
 		if ((maskbits & (PVF_WRITE|PVF_MOD)) && L2_S_WRITABLE(opte)) {
 			vm_page_dirty(pg);
 
 			/* make the pte read only */
 			npte |= L2_APX;
 		}
 
 		if (maskbits & PVF_REF) {
 			/*
 			 * Make the PTE invalid so that we will take a
 			 * page fault the next time the mapping is
 			 * referenced.
 			 */
 			npte &= ~L2_TYPE_MASK;
 			npte |= L2_TYPE_INV;
 		}
 
 		CTR4(KTR_PMAP,"clearbit: pmap:%p bits:%x pte:%x->%x",
 		    pm, maskbits, opte, npte);
 		if (npte != opte) {
 			count++;
 			*ptep = npte;
 			PTE_SYNC(ptep);
 			/*
 			 * Flush the TLB entry according to how the mapping
 			 * has been used: instruction+data if it was
 			 * referenced and executable, data only if it was
 			 * merely referenced.
 			 * NOTE(review): no pmap_is_current() test is made
 			 * here even though the original comment read
 			 * "if a current pmap" -- confirm this is intended.
 			 */
 			if (PV_BEEN_EXECD(oflags))
 				cpu_tlb_flushID_SE(pv->pv_va);
 			else if (PV_BEEN_REFD(oflags))
 				cpu_tlb_flushD_SE(pv->pv_va);
 		}
 
 		PMAP_UNLOCK(pm);
 
 	}
 
 	/* No writable mapping is left once PVF_WRITE has been cleared. */
 	if (maskbits & PVF_WRITE)
 		vm_page_aflag_clear(pg, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
 /*
  * main pv_entry manipulation functions:
  *   pmap_enter_pv: enter a mapping onto a vm_page list
  *   pmap_remove_pv: remove a mapping from a vm_page list
  *
  * NOTE: pmap_enter_pv expects to lock the pvh itself
  *       pmap_remove_pv expects the caller to lock the pvh before calling
  */
 
 /*
  * pmap_enter_pv: record a new mapping of a page
  *
  *	pg	page being mapped
  *	pve	caller-supplied pv entry to fill in and link
  *	pm	pmap the mapping belongs to
  *	va	virtual address of the mapping
  *	flags	PVF_* flags describing the mapping
  *
  * => caller must hold pvh_global_lock for writing (asserted)
  * => caller must hold the pmap lock (asserted)
  * => the pmap's wired count is adjusted here when PVF_WIRED is set, so
  *    the caller must not adjust it
  */
 static void
 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm,
     vm_offset_t va, u_int flags)
 {
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_ASSERT_LOCKED(pm);
 
 	/* Describe the mapping in the pv entry. */
 	pve->pv_va = va;
 	pve->pv_pmap = pm;
 	pve->pv_flags = flags;
 
 	/* Link the entry on both the pmap's and the page's lists. */
 	TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist);
 	TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list);
 
 	if ((pve->pv_flags & PVF_WIRED) != 0)
 		pm->pm_stats.wired_count++;
 
 	/* Fold the referenced/modified bits into the page's attributes. */
 	pg->md.pvh_attrs |= (flags & (PVF_REF | PVF_MOD));
 	vm_page_aflag_set(pg, PGA_REFERENCED);
 }
 
 /*
  * pmap_find_pv: locate the pv entry for a (pmap, va) pair
  *
  * Walks the page's pv list and returns the entry whose pmap and virtual
  * address both match, or NULL when there is none.
  *
  * => caller must hold pvh_global_lock for writing (asserted)
  */
 static PMAP_INLINE struct pv_entry *
 pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	struct pv_entry *cur;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	TAILQ_FOREACH(cur, &pg->md.pv_list, pv_list) {
 		if (pm == cur->pv_pmap && va == cur->pv_va)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * vector_page_setprot:
  *
  *	Change the protection of the kernel mapping of the vector page.
  *	VM_PROT_EXECUTE is always added to the requested protection and
  *	the mapping is set up as a non-user one.  The data TLB entry for
  *	the page is flushed afterwards.
  */
 void
 vector_page_setprot(int prot)
 {
 	struct l2_bucket *bucket;
 	pt_entry_t *pte;
 
 	bucket = pmap_get_l2_bucket(pmap_kernel(), vector_page);
 	pte = bucket->l2b_kva + l2pte_index(vector_page);
 
 	pmap_set_prot(pte, prot | VM_PROT_EXECUTE, 0);
 
 	cpu_tlb_flushD_SE(vector_page);
 	cpu_cpwait();
 }
 
 /*
  * Rewrite the protection bits of an L2 small-page descriptor in place.
  *
  *	ptep	descriptor to modify
  *	prot	VM_PROT_* bits; only VM_PROT_EXECUTE and VM_PROT_WRITE are
  *		examined
  *	user	non-zero to also grant user access
  *
  * The descriptor is updated by several successive read-modify-writes of
  * *ptep; no PTE_SYNC or TLB maintenance is done here.
  *
  * NOTE(review): L2_XN is only ever set in this function, never cleared
  * explicitly.  Unless L2_S_PROT_MASK covers it, a descriptor that already
  * has L2_XN set stays non-executable even when VM_PROT_EXECUTE is
  * requested -- confirm against the macro definitions.
  */
 static void
 pmap_set_prot(pt_entry_t *ptep, vm_prot_t prot, uint8_t user)
 {
 
 	/* Start from a clean slate for the protection field. */
 	*ptep &= ~L2_S_PROT_MASK;
 
 	if (!(prot & VM_PROT_EXECUTE))
 		*ptep |= L2_XN;
 
 	/* Read access is always granted. */
 	*ptep |= L2_S_PROT_R;
 
 	if (user)
 		*ptep |= L2_S_PROT_U;
 
 	/* Clearing L2_APX is what makes the mapping writable. */
 	if (prot & VM_PROT_WRITE)
 		*ptep &= ~(L2_APX);
 }
 
 /*
  * pmap_remove_pv: try to remove a mapping from a pv_list
  *
  * => caller should hold proper lock on pmap_main_lock
  * => pmap should be locked
  * => caller should hold lock on vm_page [so that attrs can be adjusted]
  * => caller should adjust ptp's wire_count and free PTP if needed
  * => caller should NOT adjust pmap's wire_count
  * => we return the removed pve
  */
 
 /*
  * pmap_nuke_pv: unlink a known pv entry from its page and pmap
  *
  *	pg	page the entry maps
  *	pm	pmap the entry belongs to
  *	pve	entry to unlink; it is not freed here
  *
  * Besides unlinking, this keeps the page's state consistent: the pmap's
  * wired count is dropped for a wired entry, the page is marked dirty if
  * its saved attributes have PVF_MOD, and the saved reference/modify and
  * PGA_WRITEABLE state is cleared when no mapping (respectively no
  * writable mapping) remains.
  *
  * => caller must hold pvh_global_lock for writing (asserted)
  * => caller must hold the pmap lock (asserted)
  */
 static void
 pmap_nuke_pv(struct vm_page *pg, pmap_t pm, struct pv_entry *pve)
 {
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_ASSERT_LOCKED(pm);
 
 	TAILQ_REMOVE(&pg->md.pv_list, pve, pv_list);
 	TAILQ_REMOVE(&pm->pm_pvlist, pve, pv_plist);
 
 	if (pve->pv_flags & PVF_WIRED)
 		--pm->pm_stats.wired_count;
 
 	if (pg->md.pvh_attrs & PVF_MOD)
 		vm_page_dirty(pg);
 
 	/* Last mapping gone: forget the saved reference bit. */
 	if (TAILQ_FIRST(&pg->md.pv_list) == NULL)
 		pg->md.pvh_attrs &= ~PVF_REF;
 	else
 		vm_page_aflag_set(pg, PGA_REFERENCED);
 
 	if (pve->pv_flags & PVF_WRITE) {
 		/*
 		 * A writable mapping was removed.  `pve' is reused as the
 		 * loop cursor from here on: it ends up NULL when no other
 		 * writable mapping of the page is left.
 		 */
 		TAILQ_FOREACH(pve, &pg->md.pv_list, pv_list)
 		    if (pve->pv_flags & PVF_WRITE)
 			    break;
 		if (!pve) {
 			pg->md.pvh_attrs &= ~PVF_MOD;
 			vm_page_aflag_clear(pg, PGA_WRITEABLE);
 		}
 	}
 }
 
 /*
  * Find the pv entry for (pm, va) on the page's pv list and unlink it with
  * pmap_nuke_pv().
  *
  * Returns the unlinked entry, or NULL when the page has no such mapping.
  * The entry itself is not freed.
  *
  * => caller must hold pvh_global_lock for writing (asserted)
  */
 static struct pv_entry *
 pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	struct pv_entry *cur;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	TAILQ_FOREACH(cur, &pg->md.pv_list, pv_list) {
 		if (cur->pv_pmap != pm || cur->pv_va != va)
 			continue;
 		/* Match: unlink it and stop looking. */
 		pmap_nuke_pv(pg, pm, cur);
 		break;
 	}
 
 	return (cur);
 }
 
 /*
  *
  * pmap_modify_pv: Update pv flags
  *
  *	pg		page whose mapping is being changed
  *	pm, va		identify the mapping
  *	clr_mask	PVF_* bits to clear in the pv entry
  *	set_mask	PVF_* bits to set in the pv entry
  *
  * => caller must hold the pmap lock and pvh_global_lock for writing
  *    (both asserted)
  * => caller should NOT adjust pmap's wire_count; it is adjusted here when
  *    PVF_WIRED changes
  * => we return the old flags, or 0 when the mapping has no pv entry
  *
  * Modify a physical-virtual mapping in the pv table
  */
 static u_int
 pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va,
     u_int clr_mask, u_int set_mask)
 {
 	struct pv_entry *npv;
 	u_int flags, oflags;
 
 	PMAP_ASSERT_LOCKED(pm);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if ((npv = pmap_find_pv(pg, pm, va)) == NULL)
 		return (0);
 
 	/*
 	 * There is at least one VA mapping this page.
 	 */
 
 	/*
 	 * NOTE(review): the test is on clr_mask but the bits merged into
 	 * the page attributes come from set_mask -- confirm this is the
 	 * intended condition.
 	 */
 	if (clr_mask & (PVF_REF | PVF_MOD))
 		pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD);
 
 	oflags = npv->pv_flags;
 	npv->pv_flags = flags = (oflags & ~clr_mask) | set_mask;
 
 	/* Keep the pmap's wired count in step with a PVF_WIRED change. */
 	if ((flags ^ oflags) & PVF_WIRED) {
 		if (flags & PVF_WIRED)
 			++pm->pm_stats.wired_count;
 		else
 			--pm->pm_stats.wired_count;
 	}
 	if ((oflags & PVF_WRITE) && !(flags & PVF_WRITE)) {
 		/*
 		 * This mapping just lost write permission.  `npv' is reused
 		 * as the loop cursor and ends up NULL when the page has no
 		 * writable mapping left.
 		 */
 		TAILQ_FOREACH(npv, &pg->md.pv_list, pv_list) {
 			if (npv->pv_flags & PVF_WRITE)
 				break;
 		}
 		if (!npv) {
 			pg->md.pvh_attrs &= ~PVF_MOD;
 			vm_page_aflag_clear(pg, PGA_WRITEABLE);
 		}
 	}
 
 	return (oflags);
 }
 
 /*
  * Set the debug level of the pmap code and report the new value.
  * Only built when PMAP_DEBUG is defined.
  */
 #ifdef PMAP_DEBUG
 void
 pmap_debug(int level)
 {
 
 	pmap_debug_level = level;
 	dprintf("pmap_debug: level=%d\n", pmap_debug_level);
 }
 #endif  /* PMAP_DEBUG */
 
 /*
  * Initialise the given pmap as a byte-for-byte copy of the kernel pmap,
  * then give it a lock of its own.
  */
 void
 pmap_pinit0(struct pmap *pmap)
 {
 
 	PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap));
 	dprintf("pmap_pinit0: pmap = %08x, pm_pdir = %08x\n",
 	    (u_int32_t) pmap, (u_int32_t) pmap->pm_pdir);
 
 	/* Clone the kernel pmap wholesale ... */
 	bcopy(kernel_pmap, pmap, sizeof(*pmap));
 
 	/* ... but wipe the copied mutex before initialising a fresh one. */
 	bzero(&pmap->pm_mtx, sizeof(pmap->pm_mtx));
 	PMAP_LOCK_INIT(pmap);
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: the default
  *	memory attribute and an empty pv list.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	m->md.pv_memattr = VM_MEMATTR_DEFAULT;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *      Initialize the pmap module.
  *      Called by vm_init, to initialize any structures that the pmap
  *      system needs to map virtual memory.
  *
  *      Creates the UMA zones for L2 page tables, L2 descriptor tables
  *      and pv entries, and computes the pv entry limits.
  */
 void
 pmap_init(void)
 {
 	int shpgperproc = PMAP_SHPGPERPROC;
 
 	PDEBUG(1, printf("pmap_init: phys_start = %08x\n", PHYSADDR));
 
 	/*
 	 * NOTE(review): both zones below are created under the same name,
 	 * "L2 Table", although they hold objects of different sizes
 	 * (L2_TABLE_SIZE_REAL vs. sizeof(struct l2_dtable)).  Presumably
 	 * this makes them hard to tell apart in zone statistics -- confirm
 	 * and consider giving the second one a distinct name.
 	 */
 	l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 
 	/*
 	 * Initialize the PV entry allocator.
 	 *
 	 * The "vm.pmap.shpgperproc" tunable may override the compile-time
 	 * PMAP_SHPGPERPROC default.  The high water mark is set to 90% of
 	 * pv_entry_max.
 	 */
 	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
-	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
+	uma_zone_reserve_kva(pvzone, pv_entry_max);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
 	/*
 	 * Now it is safe to enable pv_table recording.
 	 */
 	PDEBUG(1, printf("pmap_init: done!\n"));
 }
 
 /*
  * pmap_fault_fixup: try to resolve a fault without involving the VM layer
  *
  * Handles the software emulation of the "modified" and "referenced" bits
  * and repairs a stale L1 descriptor for the faulting address.
  *
  * pm	 pmap in which the fault occurred
  * va	 faulting virtual address
  * ftype access type that faulted (VM_PROT_* bits)
  * user	 non-zero when the access came from user mode
  *
  * Returns 0 when the fault was not handled here (no L2 metadata, no PTE,
  * user access to a mapping whose protection bits equal L2_S_PROT_U, the
  * vector page, an unmanaged page, or a genuine write fault).  Returns 1
  * once the TLB entry for 'va' has been flushed, whether or not a PTE or
  * L1 entry actually had to be changed.
  *
  * Takes pvh_global_lock (write) and the pmap lock for the duration.
  */
 int
 pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	pd_entry_t *pl1pd, l1pd;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	u_int l1idx;
 	int rv = 0;
 
 	l1idx = L1_IDX(va);
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pm);
 
 	/*
 	 * If there is no l2_dtable for this address, then the process
 	 * has no business accessing it.
 	 *
 	 * Note: This will catch userland processes trying to access
 	 * kernel addresses.
 	 */
 	l2 = pm->pm_l2[L2_IDX(l1idx)];
 	if (l2 == NULL)
 		goto out;
 
 	/*
 	 * Likewise if there is no L2 descriptor table
 	 */
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 	if (l2b->l2b_kva == NULL)
 		goto out;
 
 	/*
 	 * Check the PTE itself.
 	 */
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 	pte = *ptep;
 	if (pte == 0)
 		goto out;
 
 	/*
 	 * Catch a userland access to the vector page mapped at 0x0
 	 */
 	if (user && ((pte & L2_S_PROT_MASK) == L2_S_PROT_U))
 		goto out;
 	if (va == vector_page)
 		goto out;
 
 	pa = l2pte_pa(pte);
 	CTR5(KTR_PMAP, "pmap_fault_fix: pmap:%p va:%x pte:0x%x ftype:%x user:%x",
 	    pm, va, pte, ftype, user);
 	if ((ftype & VM_PROT_WRITE) && !(L2_S_WRITABLE(pte))) {
 		/*
 		 * This looks like a good candidate for "page modified"
 		 * emulation...
 		 */
 		struct pv_entry *pv;
 		struct vm_page *pg;
 
 		/* Extract the physical address of the page */
 		if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) {
 			goto out;
 		}
 		/* Get the current flags for this page. */
 
 		pv = pmap_find_pv(pg, pm, va);
 		if (pv == NULL) {
 			goto out;
 		}
 
 		/*
 		 * Do the flags say this page is writable? If not then it
 		 * is a genuine write fault. If yes then the write fault is
 		 * our fault as we did not reflect the write access in the
 		 * PTE. Now we know a write has occurred we can correct this
 		 * and also set the modified bit
 		 */
 		if ((pv->pv_flags & PVF_WRITE) == 0) {
 			goto out;
 		}
 		pg->md.pvh_attrs |= PVF_REF | PVF_MOD;
 		vm_page_dirty(pg);
 		pv->pv_flags |= PVF_REF | PVF_MOD;
 
 		/* Re-enable write permissions for the page */
 		*ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO;
 		pmap_set_prot(ptep, VM_PROT_WRITE, *ptep & L2_S_PROT_U);
 		/*
 		 * Trace the PTE as it now stands in the table; the local
 		 * 'pte' still holds the value read before the update.
 		 */
 		CTR1(KTR_PMAP, "pmap_fault_fix: new pte:0x%x", *ptep);
 		PTE_SYNC(ptep);
 		rv = 1;
 	} else if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) {
 		/*
 		 * This looks like a good candidate for "page referenced"
 		 * emulation.
 		 */
 		struct pv_entry *pv;
 		struct vm_page *pg;
 
 		/* Extract the physical address of the page */
 		if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
 			goto out;
 		/* Get the current flags for this page. */
 
 		pv = pmap_find_pv(pg, pm, va);
 		if (pv == NULL)
 			goto out;
 
 		pg->md.pvh_attrs |= PVF_REF;
 		pv->pv_flags |= PVF_REF;
 
 
 		/* Turn the invalid PTE back into a valid small-page entry. */
 		*ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO;
 		PTE_SYNC(ptep);
 		rv = 1;
 	}
 
 	/*
 	 * We know there is a valid mapping here, so simply
 	 * fix up the L1 if necessary.
 	 */
 	pl1pd = &pm->pm_l1->l1_kva[l1idx];
 	l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO;
 	if (*pl1pd != l1pd) {
 		*pl1pd = l1pd;
 		PTE_SYNC(pl1pd);
 		rv = 1;
 	}
 
 #ifdef DEBUG
 	/*
 	 * If 'rv == 0' at this point, it generally indicates that there is a
 	 * stale TLB entry for the faulting address. This happens when two or
 	 * more processes are sharing an L1. Since we don't flush the TLB on
 	 * a context switch between such processes, we can take domain faults
 	 * for mappings which exist at the same VA in both processes. EVEN IF
 	 * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for
 	 * example.
 	 *
 	 * This is extremely likely to happen if pmap_enter() updated the L1
 	 * entry for a recently entered mapping. In this case, the TLB is
 	 * flushed for the new mapping, but there may still be TLB entries for
 	 * other mappings belonging to other processes in the 1MB range
 	 * covered by the L1 entry.
 	 *
 	 * Since 'rv == 0', we know that the L1 already contains the correct
 	 * value, so the fault must be due to a stale TLB entry.
 	 *
 	 * Since we always need to flush the TLB anyway in the case where we
 	 * fixed up the L1, or frobbed the L2 PTE, we effectively deal with
 	 * stale TLB entries dynamically.
 	 *
 	 * However, the above condition can ONLY happen if the current L1 is
 	 * being shared. If it happens when the L1 is unshared, it indicates
 	 * that other parts of the pmap are not doing their job WRT managing
 	 * the TLB.
 	 */
 	if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) {
 		printf("fixup: pm %p, va 0x%08x, ftype %d - nothing to do!\n",
 		    pm, va, ftype);
 		printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n",
 		    l2, l2b, ptep, pl1pd);
 		printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n",
 		    pte, l1pd, last_fault_code);
 #ifdef DDB
 		Debugger();
 #endif
 	}
 #endif
 
 	/* Always drop any TLB entry for the faulting address. */
 	cpu_tlb_flushID_SE(va);
 	cpu_cpwait();
 
 	/* Reaching this point counts as handled, even if nothing changed. */
 	rv = 1;
 
 out:
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pm);
 	return (rv);
 }
 
 /*
  * pmap_postinit: allocate the additional L1 translation tables
  *
  * The number of tables is ceil(maxproc / PMAP_DOMAINS) minus one.
  * NOTE(review): the "minus one" presumably accounts for the static L1
  * registered by pmap_bootstrap() -- confirm.
  *
  * Each table is a physically contiguous, L1_TABLE_SIZE-aligned region.
  * The kernel PTEs that map it are switched to the page-table cache mode
  * before the table is handed to pmap_init_l1().
  */
 void
 pmap_postinit(void)
 {
 	struct l2_bucket *l2b;
 	struct l1_ttable *l1;
 	pd_entry_t *pl1pt;
 	pt_entry_t *ptep, pte;
 	vm_offset_t va, eva;
 	u_int loop, needed;
 
 	needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0);
 	needed -= 1;
 	/* One l1_ttable descriptor per table; 'l1' walks this array below. */
 	l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK);
 
 	for (loop = 0; loop < needed; loop++, l1++) {
 		/* Allocate a L1 page table */
 		va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, M_VMPMAP, 0, 0x0,
 		    0xffffffff, L1_TABLE_SIZE, 0);
 
 		if (va == 0)
 			panic("Cannot allocate L1 KVM");
 
 		eva = va + L1_TABLE_SIZE;
 		pl1pt = (pd_entry_t *)va;
 
 		/*
 		 * Walk every page of the new table and rewrite the cache
 		 * bits of the kernel PTE that maps it, flushing the data
 		 * TLB entry for each page as we go.
 		 */
 		while (va < eva) {
 				l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 				ptep = &l2b->l2b_kva[l2pte_index(va)];
 				pte = *ptep;
 				pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
 				*ptep = pte;
 				PTE_SYNC(ptep);
 				cpu_tlb_flushD_SE(va);
 
 				va += PAGE_SIZE;
 		}
 		pmap_init_l1(l1, pl1pt);
 	}
 #ifdef DEBUG
 	printf("pmap_postinit: Allocated %d static L1 descriptor tables\n",
 	    needed);
 #endif
 }
 
 /*
  * Cache in the PCB the values cpu_switch() et al. need quickly: the
  * physical address of the pmap's L1 table, the domain access control
  * value, and (when the vector page lives below KERNBASE) the location
  * and contents of the L1 entry that covers the vector page.
  */
 void
 pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb)
 {
 	struct l2_bucket *vec_l2b;
 
 	pcb->pcb_pagedir = pm->pm_l1->l1_physaddr;
 	pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) |
 	    (DOMAIN_CLIENT << (pm->pm_domain * 2));
 
 	/* A vector page at or above KERNBASE needs no per-pmap L1 fixup. */
 	if (vector_page >= KERNBASE) {
 		pcb->pcb_pl1vec = NULL;
 		return;
 	}
 
 	pcb->pcb_pl1vec = &pm->pm_l1->l1_kva[L1_IDX(vector_page)];
 	vec_l2b = pmap_get_l2_bucket(pm, vector_page);
 	pcb->pcb_l1vec = vec_l2b->l2b_phys | L1_C_PROTO |
 	    L1_C_DOM(pm->pm_domain) | L1_C_DOM(PMAP_DOMAIN_KERNEL);
 }
 
 /*
  * pmap_activate: make the thread's address space the active one
  *
  * Refreshes the page directory values cached in the thread's PCB.  If the
  * thread is the one currently running and the hardware translation table
  * base or domain access control register differ from those values, the
  * CPU is switched to the new L1 table.  Runs inside a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t pm;
 	struct pcb *pcb;
 
 	pm = vmspace_pmap(td->td_proc->p_vmspace);
 	pcb = td->td_pcb;
 
 	critical_enter();
 	pmap_set_pcb_pagedir(pm, pcb);
 
 	if (td == curthread) {
 		u_int cur_dacr, cur_ttb;
 
 		/* Read the current TTB (c2) and DACR (c3) from CP15. */
 		__asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cur_ttb));
 		__asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(cur_dacr));
 
 		/* Keep only the L1 table base address bits. */
 		cur_ttb &= ~(L1_TABLE_SIZE - 1);
 
 		if (cur_ttb == (u_int)pcb->pcb_pagedir &&
 		    cur_dacr == pcb->pcb_dacr) {
 			/*
 			 * No need to switch address spaces.
 			 */
 			critical_exit();
 			return;
 		}
 
 
 		/*
 		 * We MUST, I repeat, MUST fix up the L1 entry corresponding
 		 * to 'vector_page' in the incoming L1 table before switching
 		 * to it otherwise subsequent interrupts/exceptions (including
 		 * domain faults!) will jump into hyperspace.
 		 */
 		if (pcb->pcb_pl1vec) {
 			*pcb->pcb_pl1vec = pcb->pcb_l1vec;
 		}
 
 		cpu_domains(pcb->pcb_dacr);
 		cpu_setttb(pcb->pcb_pagedir);
 	}
 	critical_exit();
 }
 
 /*
  * pmap_set_pt_cache_mode: force page-table cache mode on one mapping
  *
  * Looks up the mapping of 'va' through the L1 table 'kl1' and, if its
  * cache bits differ from the page-table cache mode, rewrites them.
  * Section mappings are fixed in the L1 descriptor itself; otherwise the
  * L2 PTE found via kernel_pt_lookup() is fixed.
  *
  * Returns 1 if a descriptor had to be changed, 0 if it was already right.
  * Panics if no L2 table can be found for a non-section mapping.
  */
 static int
 pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va)
 {
 	pd_entry_t *pdep, pde;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	int rv = 0;
 
 	/*
 	 * Make sure the descriptor itself has the correct cache mode
 	 */
 	pdep = &kl1[L1_IDX(va)];
 	pde = *pdep;
 
 	if (l1pte_section_p(pde)) {
 		if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) {
 			*pdep = (pde & ~L1_S_CACHE_MASK) |
 			    pte_l1_s_cache_mode_pt;
 			PTE_SYNC(pdep);
 			rv = 1;
 		}
 	} else {
 		pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
 		ptep = (pt_entry_t *)kernel_pt_lookup(pa);
 		/*
 		 * Report the address being looked up: 'ptep' is NULL on
 		 * this path and would tell the reader nothing.
 		 */
 		if (ptep == NULL)
 			panic("pmap_bootstrap: No L2 for L2 @ va %p\n",
 			    (void *)va);
 
 		ptep = &ptep[l2pte_index(va)];
 		pte = *ptep;
 		if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
 			*ptep = (pte & ~L2_S_CACHE_MASK) |
 			    pte_l2_s_cache_mode_pt;
 			PTE_SYNC(ptep);
 			rv = 1;
 		}
 	}
 
 	return (rv);
 }
 
 /*
  * pmap_alloc_specials: carve 'pages' pages of KVA off the front of *availp
  *
  * The start of the carved range is stored in *vap and *availp is advanced
  * past it.  When 'ptep' is non-NULL, *ptep receives a pointer to the
  * kernel PTE for the first page; the function panics if no L2 bucket
  * covers that address.
  */
 static void
 pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap,
     pt_entry_t **ptep)
 {
 	struct l2_bucket *bucket;
 	vm_offset_t base;
 
 	base = *availp;
 
 	if (ptep != NULL) {
 		bucket = pmap_get_l2_bucket(pmap_kernel(), base);
 		if (bucket == NULL)
 			panic("pmap_alloc_specials: no l2b for 0x%x", base);
 
 		*ptep = &bucket->l2b_kva[l2pte_index(base)];
 	}
 
 	*vap = base;
 	*availp = base + (PAGE_SIZE * pages);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On the arm this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  *
  *	firstaddr  first free kernel virtual address (becomes virtual_avail)
  *	lastaddr   end of kernel virtual address space (becomes virtual_end)
  *	l1pt	   VA/PA pair describing the L1 table built by initarm()
  */
 /* Number of l2_dtable structures available before any allocator exists. */
 #define PMAP_STATIC_L2_SIZE 16
 
 void
 pmap_bootstrap(vm_offset_t firstaddr, vm_offset_t lastaddr, struct pv_addr *l1pt)
 {
 	static struct l1_ttable static_l1;
 	static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE];
 	struct l1_ttable *l1 = &static_l1;
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	pd_entry_t pde;
 	pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va;
 	pt_entry_t *ptep;
 	vm_paddr_t pa;
 	vm_offset_t va;
 	vm_size_t size;
 	int l1idx, l2idx, l2next = 0;
 
 	PDEBUG(1, printf("firstaddr = %08x, lastaddr = %08x\n",
 	    firstaddr, lastaddr));
 
 	virtual_avail = firstaddr;
 	kernel_pmap->pm_l1 = l1;
 	kernel_l1pa = l1pt->pv_pa;
 
 	/*
 	 * Scan the L1 translation table created by initarm() and create
 	 * the required metadata for all valid mappings found in it.
 	 */
 	for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) {
 		pde = kernel_l1pt[l1idx];
 
 		/*
 		 * We're only interested in Coarse mappings.
 		 * pmap_extract() can deal with section mappings without
 		 * recourse to checking L2 metadata.
 		 */
 		if ((pde & L1_TYPE_MASK) != L1_TYPE_C)
 			continue;
 
 		/*
 		 * Lookup the KVA of this L2 descriptor table
 		 */
 		pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
 		ptep = (pt_entry_t *)kernel_pt_lookup(pa);
 
 		if (ptep == NULL) {
 			panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx",
 			    (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa);
 		}
 
 		/*
 		 * Fetch the associated L2 metadata structure.
 		 * Allocate a new one if necessary.
 		 */
 		if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) {
 			if (l2next == PMAP_STATIC_L2_SIZE)
 				panic("pmap_bootstrap: out of static L2s");
 			kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 =
 			    &static_l2[l2next++];
 		}
 
 		/*
 		 * One more L1 slot tracked...
 		 */
 		l2->l2_occupancy++;
 
 		/*
 		 * Fill in the details of the L2 descriptor in the
 		 * appropriate bucket.
 		 */
 		l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 		l2b->l2b_kva = ptep;
 		l2b->l2b_phys = pa;
 		l2b->l2b_l1idx = l1idx;
 
 		/*
 		 * Establish an initial occupancy count for this descriptor
 		 */
 		for (l2idx = 0;
 		    l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
 		    l2idx++) {
 			if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) {
 				l2b->l2b_occupancy++;
 			}
 		}
 
 		/*
 		 * Make sure the descriptor itself has the correct cache mode.
 		 * If not, fix it, but whine about the problem. Port-meisters
 		 * should consider this a clue to fix up their initarm()
 		 * function. :)
 		 */
 		if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) {
 			printf("pmap_bootstrap: WARNING! wrong cache mode for "
 			    "L2 pte @ %p\n", ptep);
 		}
 	}
 
 
 	/*
 	 * Ensure the primary (kernel) L1 has the correct cache mode for
 	 * a page table. Complain if it is not correctly set.
 	 */
 	for (va = (vm_offset_t)kernel_l1pt;
 	    va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) {
 		if (pmap_set_pt_cache_mode(kernel_l1pt, va))
 			printf("pmap_bootstrap: WARNING! wrong cache mode for "
 			    "primary L1 @ 0x%x\n", va);
 	}
 
 	/* Push out any descriptor changes made above and drop stale TLBs. */
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 	cpu_tlb_flushID();
 	cpu_cpwait();
 
 	PMAP_LOCK_INIT(kernel_pmap);
 	CPU_FILL(&kernel_pmap->pm_active);
 	kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL;
 	TAILQ_INIT(&kernel_pmap->pm_pvlist);
 
 	/*
 	 * Initialize the global pv list lock.
 	 */
 	rw_init(&pvh_global_lock, "pmap pv global");
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 */
 
 	pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte);
 	pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte);
 	pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte);
 	pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte);
 	/*
 	 * Number of 1MB sections between the current end of mapped KVA
 	 * and lastaddr; one L2 table is reserved per section.
 	 */
 	size = ((lastaddr - pmap_curmaxkvaddr) + L1_S_OFFSET) / L1_S_SIZE;
 	pmap_alloc_specials(&virtual_avail,
 	    round_page(size * L2_TABLE_SIZE_REAL) / PAGE_SIZE,
 	    &pmap_kernel_l2ptp_kva, NULL);
 
 	/* Likewise reserve KVA for the l2_dtable structures covering them. */
 	size = (size + (L2_BUCKET_SIZE - 1)) / L2_BUCKET_SIZE;
 	pmap_alloc_specials(&virtual_avail,
 	    round_page(size * sizeof(struct l2_dtable)) / PAGE_SIZE,
 	    &pmap_kernel_l2dtable_kva, NULL);
 
 	pmap_alloc_specials(&virtual_avail,
 	    1, (vm_offset_t*)&_tmppt, NULL);
 	pmap_alloc_specials(&virtual_avail,
 	    MAXDUMPPGS, (vm_offset_t *)&crashdumpmap, NULL);
 	SLIST_INIT(&l1_list);
 	TAILQ_INIT(&l1_lru_list);
 	mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF);
 	pmap_init_l1(l1, kernel_l1pt);
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 
 	virtual_avail = round_page(virtual_avail);
 	virtual_end = lastaddr;
 	kernel_vm_end = pmap_curmaxkvaddr;
 	arm_nocache_startaddr = lastaddr;
 	mtx_init(&cmtx, "TMP mappings mtx", NULL, MTX_DEF);
 
 	pmap_set_pcb_pagedir(kernel_pmap, thread0.td_pcb);
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * Flushes caches and TLB, removes the pmap's vector page mapping when
  * the vector page lives below KERNBASE, then frees the L1 table and
  * destroys the pmap lock.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	struct pcb *pcb;
 
 	cpu_idcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 	cpu_tlb_flushID();
 	cpu_cpwait();
 	if (vector_page < KERNBASE) {
 		struct pcb *curpcb = PCPU_GET(curpcb);
 		/* thread0's PCB supplies the values to switch to. */
 		pcb = thread0.td_pcb;
 		if (pmap_is_current(pmap)) {
 			/*
 			 * Frob the L1 entry corresponding to the vector
 			 * page so that it contains the kernel pmap's domain
 			 * number. This will ensure pmap_remove() does not
 			 * pull the current vector page out from under us.
 			 */
 			critical_enter();
 			*pcb->pcb_pl1vec = pcb->pcb_l1vec;
 			cpu_domains(pcb->pcb_dacr);
 			cpu_setttb(pcb->pcb_pagedir);
 			critical_exit();
 		}
 		pmap_remove(pmap, vector_page, vector_page + PAGE_SIZE);
 		/*
 		 * Make sure cpu_switch(), et al, DTRT. This is safe to do
 		 * since this process has no remaining mappings of its own.
 		 */
 		curpcb->pcb_pl1vec = pcb->pcb_pl1vec;
 		curpcb->pcb_l1vec = pcb->pcb_l1vec;
 		curpcb->pcb_dacr = pcb->pcb_dacr;
 		curpcb->pcb_pagedir = pcb->pcb_pagedir;
 
 	}
 	pmap_free_l1(pmap);
 	PMAP_LOCK_DESTROY(pmap);
 
 	dprintf("pmap_release()\n");
 }
 
 
 
 /*
  * Helper function for pmap_grow_l2_bucket()
  *
  * Allocates one wired page and enters it into the kernel page tables at
  * 'va' with the given cache mode and read/write kernel protection.  When
  * 'pap' is non-NULL the page's physical address is stored through it.
  *
  * Returns 0 on success, 1 if no page could be allocated.
  */
 static __inline int
 pmap_grow_map(vm_offset_t va, pt_entry_t cache_mode, vm_paddr_t *pap)
 {
 	struct vm_page *page;
 	struct l2_bucket *bucket;
 	pt_entry_t *slot;
 	vm_paddr_t phys;
 
 	page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 	if (page == NULL)
 		return (1);
 
 	phys = VM_PAGE_TO_PHYS(page);
 	if (pap != NULL)
 		*pap = phys;
 
 	bucket = pmap_get_l2_bucket(pmap_kernel(), va);
 	slot = &bucket->l2b_kva[l2pte_index(va)];
 
 	*slot = L2_S_PROTO | phys | cache_mode;
 	pmap_set_prot(slot, VM_PROT_READ | VM_PROT_WRITE, 0);
 	PTE_SYNC(slot);
 
 	return (0);
 }
 
 /*
  * This is the same as pmap_alloc_l2_bucket(), except that it is only
  * used by pmap_growkernel().
  *
  * Storage for new l2_dtable structures and L2 page tables is taken
  * sequentially from the KVA ranges starting at pmap_kernel_l2dtable_kva
  * and pmap_kernel_l2ptp_kva, with backing pages mapped in on demand by
  * pmap_grow_map().
  *
  * Returns the bucket for 'va', or NULL if a backing page could not be
  * allocated.
  */
 static __inline struct l2_bucket *
 pmap_grow_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	struct l1_ttable *l1;
 	pd_entry_t *pl1pd;
 	u_short l1idx;
 	vm_offset_t nva;
 
 	l1idx = L1_IDX(va);
 
 	if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
 		/*
 		 * No mapping at this address, as there is
 		 * no entry in the L1 table.
 		 * Need to allocate a new l2_dtable.
 		 */
 		nva = pmap_kernel_l2dtable_kva;
 		if ((nva & PAGE_MASK) == 0) {
 			/*
 			 * Need to allocate a backing page
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
 				return (NULL);
 		}
 
 		l2 = (struct l2_dtable *)nva;
 		nva += sizeof(struct l2_dtable);
 
 		/*
 		 * The page offset wrapping around means the structure's end
 		 * crossed into the next page.
 		 */
 		if ((nva & PAGE_MASK) < (pmap_kernel_l2dtable_kva &
 		    PAGE_MASK)) {
 			/*
 			 * The new l2_dtable straddles a page boundary.
 			 * Map in another page to cover it.
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
 				return (NULL);
 		}
 
 		pmap_kernel_l2dtable_kva = nva;
 
 		/*
 		 * Link it into the parent pmap
 		 */
 		pm->pm_l2[L2_IDX(l1idx)] = l2;
 		memset(l2, 0, sizeof(*l2));
 	}
 
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 
 	/*
 	 * Fetch pointer to the L2 page table associated with the address.
 	 */
 	if (l2b->l2b_kva == NULL) {
 		pt_entry_t *ptep;
 
 		/*
 		 * No L2 page table has been allocated. Chances are, this
 		 * is because we just allocated the l2_dtable, above.
 		 */
 		nva = pmap_kernel_l2ptp_kva;
 		ptep = (pt_entry_t *)nva;
 		if ((nva & PAGE_MASK) == 0) {
 			/*
 			 * Need to allocate a backing page
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode_pt,
 			    &pmap_kernel_l2ptp_phys))
 				return (NULL);
 		}
 		memset(ptep, 0, L2_TABLE_SIZE_REAL);
 		l2->l2_occupancy++;
 		l2b->l2b_kva = ptep;
 		l2b->l2b_l1idx = l1idx;
 		l2b->l2b_phys = pmap_kernel_l2ptp_phys;
 
 		/* Advance both cursors to the next L2 table slot. */
 		pmap_kernel_l2ptp_kva += L2_TABLE_SIZE_REAL;
 		pmap_kernel_l2ptp_phys += L2_TABLE_SIZE_REAL;
 	}
 
 	/* Distribute new L1 entry to all other L1s */
 	SLIST_FOREACH(l1, &l1_list, l1_link) {
 			pl1pd = &l1->l1_kva[L1_IDX(va)];
 			*pl1pd = l2b->l2b_phys | L1_C_DOM(PMAP_DOMAIN_KERNEL) |
 			    L1_C_PROTO;
 			PTE_SYNC(pl1pd);
 	}
 
 	return (l2b);
 }
 
 
 /*
  * Grow the kernel page tables, if needed, so that they cover 'addr'.
  *
  * Nothing happens when 'addr' is already within pmap_curmaxkvaddr.
  * Otherwise L2 tables are added one 1MB section at a time, the caches
  * and data TLB are flushed, and kernel_vm_end is updated.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	pmap_t kpm;
 
 	/* Already covered: we are OK. */
 	if (pmap_curmaxkvaddr >= addr)
 		return;
 
 	/* Whoops, we need to add kernel PTPs -- map 1MB at a time. */
 	kpm = pmap_kernel();
 	while (pmap_curmaxkvaddr < addr) {
 		pmap_grow_l2_bucket(kpm, pmap_curmaxkvaddr);
 		pmap_curmaxkvaddr += L1_S_SIZE;
 	}
 
 	/*
 	 * Flushing everything is expensive, but growkernel happens so
 	 * rarely that it does not matter.
 	 */
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 	cpu_tlb_flushD();
 	cpu_cpwait();
 
 	kernel_vm_end = pmap_curmaxkvaddr;
 }
 
 
 /*
  * Remove all pages from specified address space
  * this aids process exit speeds.  Also, this code
  * is special cased for current process only, but
  * can have the more generic (and slightly slower)
  * mode enabled.  This is much faster than pmap_remove
  * in the case of running down an entire address space.
  *
  * Walks the pmap's own pv list; wired mappings are left in place.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	struct pv_entry *pv, *npv;
 	struct l2_bucket *l2b = NULL;
 	vm_page_t m;
 	pt_entry_t *pt;
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
 		if (pv->pv_flags & PVF_WIRED) {
 			/* Cannot remove wired pages now. */
 			npv = TAILQ_NEXT(pv, pv_plist);
 			continue;
 		}
 		pmap->pm_stats.resident_count--;
 		l2b = pmap_get_l2_bucket(pmap, pv->pv_va);
 		KASSERT(l2b != NULL, ("No L2 bucket in pmap_remove_pages"));
 		pt = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
 		m = PHYS_TO_VM_PAGE(*pt & L2_ADDR_MASK);
 		KASSERT((vm_offset_t)m >= KERNBASE, ("Trying to access non-existent page va %x pte %x", pv->pv_va, *pt));
 		*pt = 0;
 		PTE_SYNC(pt);
 		/* Fetch the successor before 'pv' is unlinked and freed. */
 		npv = TAILQ_NEXT(pv, pv_plist);
 		pmap_nuke_pv(m, pmap, pv);
 		if (TAILQ_EMPTY(&m->md.pv_list))
 			vm_page_aflag_clear(m, PGA_WRITEABLE);
 		pmap_free_pv_entry(pv);
 		pmap_free_l2_bucket(pmap, l2b, 1);
 	}
 	rw_wunlock(&pvh_global_lock);
 	/* One full TLB flush covers every PTE cleared above. */
 	cpu_tlb_flushID();
 	cpu_cpwait();
 	PMAP_UNLOCK(pmap);
 }
 
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 #ifdef ARM_HAVE_SUPERSECTIONS
 /*
  * Map a super section into the KVA.
  *
  * 'va' and 'pa' must both be supersection aligned.  The same descriptor
  * is written into every 1MB L1 slot of the supersection, in every L1
  * table on l1_list.  'flags' selects the cache mode: SECTION_CACHE,
  * else SECTION_PT, else no cache bits.
  */
 
 void
 pmap_kenter_supersection(vm_offset_t va, uint64_t pa, int flags)
 {
 	struct l1_ttable *l1;
 	vm_offset_t cur, first, limit;
 	pd_entry_t pd;
 
 	pd = L1_S_PROTO | L1_S_SUPERSEC | (pa & L1_SUP_FRAME) |
 	    (((pa >> 32) & 0xf) << 20) | L1_S_PROT(PTE_KERNEL,
 	    VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL);
 
 	KASSERT(((va | pa) & L1_SUP_OFFSET) == 0,
 	    ("Not a valid super section mapping"));
 
 	if ((flags & SECTION_CACHE) != 0)
 		pd |= pte_l1_s_cache_mode;
 	else if ((flags & SECTION_PT) != 0)
 		pd |= pte_l1_s_cache_mode_pt;
 
 	first = va & L1_SUP_FRAME;
 	limit = va + L1_SUP_SIZE;
 	SLIST_FOREACH(l1, &l1_list, l1_link) {
 		for (cur = first; cur < limit; cur += L1_S_SIZE) {
 			l1->l1_kva[L1_IDX(cur)] = pd;
 			PTE_SYNC(&l1->l1_kva[L1_IDX(cur)]);
 		}
 	}
 }
 #endif
 
 /*
  * Map a section into the KVA.
  *
  * 'va' and 'pa' must both be section aligned.  The descriptor is written
  * into every L1 table on l1_list.  'flags' selects the cache mode:
  * SECTION_CACHE, else SECTION_PT, else no cache bits.
  */
 
 void
 pmap_kenter_section(vm_offset_t va, vm_offset_t pa, int flags)
 {
 	struct l1_ttable *l1;
 	pd_entry_t pd;
 
 	pd = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL,
 	    VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL);
 
 	KASSERT(((va | pa) & L1_S_OFFSET) == 0,
 	    ("Not a valid section mapping"));
 
 	if ((flags & SECTION_CACHE) != 0)
 		pd |= pte_l1_s_cache_mode;
 	else if ((flags & SECTION_PT) != 0)
 		pd |= pte_l1_s_cache_mode_pt;
 
 	SLIST_FOREACH(l1, &l1_list, l1_link) {
 		pd_entry_t *slot = &l1->l1_kva[L1_IDX(va)];
 
 		*slot = pd;
 		PTE_SYNC(slot);
 	}
 }
 
 /*
  * Make a temporary mapping for a physical address.  This is only intended
  * to be used for panic dumps.
  *
  * 'pa' is entered at page 'i' of the crashdumpmap window.  The value
  * returned is always the base of that window, whatever 'i' is.
  */
 void *
 pmap_kenter_temp(vm_paddr_t pa, int i)
 {
 
 	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
 	return ((void *)crashdumpmap);
 }
 
 /*
  * add a wired page to the kva
  * note that in order for the mapping to take effect -- you
  * should do a invltlb after doing the pmap_kenter...
  *
  * 'flags' may contain KENTER_CACHE (use the cacheable L2 mode, with
  * read/write protection and optional KENTER_USER access).  Without
  * KENTER_CACHE the mapping gets no cache bits and kernel
  * read/write/execute protection.
  */
 static PMAP_INLINE void
 pmap_kenter_internal(vm_offset_t va, vm_offset_t pa, int flags)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *pte;
 	pt_entry_t opte;
 
 	PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n",
 	    (uint32_t) va, (uint32_t) pa));
 
 
 	/* Grow the kernel page tables if nothing covers 'va' yet. */
 	l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 	if (l2b == NULL)
 		l2b = pmap_grow_l2_bucket(pmap_kernel(), va);
 	KASSERT(l2b != NULL, ("No L2 Bucket"));
 
 	pte = &l2b->l2b_kva[l2pte_index(va)];
 	opte = *pte;
 	if (l2pte_valid(opte)) {
 		/* Replacing a live mapping: drop its TLB entry first. */
 		cpu_tlb_flushD_SE(va);
 		cpu_cpwait();
 	} else {
 		/* Only a completely empty slot adds to the occupancy. */
 		if (opte == 0)
 			l2b->l2b_occupancy++;
 	}
 
 	if (flags & KENTER_CACHE) {
 		*pte = L2_S_PROTO | pa | pte_l2_s_cache_mode;
 		pmap_set_prot(pte, VM_PROT_READ | VM_PROT_WRITE,
 		    flags & KENTER_USER);
 	} else {
 		*pte = L2_S_PROTO | pa;
 		pmap_set_prot(pte, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE,
 		    0);
 	}
 
 	PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n",
 	    (uint32_t) pte, opte, *pte));
 	PTE_SYNC(pte);
 	cpu_cpwait();
 }
 
 /*
  * Enter a cacheable kernel mapping of 'pa' at 'va'.
  */
 void
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pmap_kenter_internal(va, pa, KENTER_CACHE);
 }
 
 /*
  * Enter a kernel mapping of 'pa' at 'va' without the KENTER_CACHE flag.
  */
 void
 pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pmap_kenter_internal(va, pa, 0);
 }
 
 /*
  * Enter a cacheable, user-accessible mapping of 'pa' at 'va' in the
  * kernel pmap.
  */
 void
 pmap_kenter_user(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pmap_kenter_internal(va, pa, KENTER_CACHE|KENTER_USER);
 
 	/*
 	 * Run the fault fixup straight away so that the first use of the
 	 * new address cannot raise an exception: one of these addresses is
 	 * used in the exception handlers, where a fault would be very bad.
 	 */
 	pmap_fault_fixup(pmap_kernel(), va, VM_PROT_READ|VM_PROT_WRITE, 1);
 }
 
 /*
  * Return the result of pmap_extract_locked() for kernel virtual
  * address 'va' in the kernel pmap.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 
 	return (pmap_extract_locked(kernel_pmap, va));
 }
 
 /*
  * remove a page from the kernel pagetables
  *
  * Does nothing when no L2 bucket covers 'va' or when the PTE is not
  * valid.  Otherwise the data TLB entry for the page is flushed and the
  * PTE is zeroed and synced.
  */
 void
 pmap_kremove(vm_offset_t va)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *pte, opte;
 
 	/* A missing bucket simply means there is nothing to remove. */
 	l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 	if (l2b == NULL)
 		return;
 
 	pte = &l2b->l2b_kva[l2pte_index(va)];
 	opte = *pte;
 	if (l2pte_valid(opte)) {
 		va = va & ~PAGE_MASK;
 		cpu_tlb_flushD_SE(va);
 		cpu_cpwait();
 		*pte = 0;
 		PTE_SYNC(pte);
 	}
 }
 
 
 /*
  *	Map the physical address range [start, end) into kernel virtual
  *	address space.
  *
  *	'*virt' is a suggested virtual address for the mapping.
  *	Architectures with a direct-mapped physical to virtual region
  *	can return an address within that region and leave '*virt'
  *	alone.  This implementation maps the pages starting at '*virt',
  *	sets '*virt' to the first usable address after the mapped
  *	region, and returns the address the mapping starts at.
  *
  *	'prot' is only reported in the debug output.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
 {
 	vm_offset_t first_va, va, pa;
 
 	first_va = *virt;
 
 	PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, "
 	    "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end,
 	    prot));
 
 	va = first_va;
 	for (pa = start; pa < end; pa += PAGE_SIZE, va += PAGE_SIZE)
 		pmap_kenter(va, pa);
 
 	*virt = va;
 	return (first_va);
 }
 
 /*
  * Enter 'count' pages from the array 'm' at consecutive kernel virtual
  * addresses starting at 'va'.
  *
  * This routine is only used for temporary kernel mappings that do not
  * need to have page modification or references recorded.  Old mappings
  * are simply written over.  The pages *must* be wired.
  */
 void
 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
 {
 	int idx;
 
 	for (idx = 0; idx < count; idx++, va += PAGE_SIZE)
 		pmap_kenter_internal(va, VM_PAGE_TO_PHYS(m[idx]),
 		    KENTER_CACHE);
 }
 
 
 /*
  * Remove 'count' consecutive page mappings from the kernel, starting
  * at 'va'.  Meant only for temporary mappings.  Pages for which
  * vtophys() yields zero are skipped.
  */
 void
 pmap_qremove(vm_offset_t va, int count)
 {
 	int idx;
 
 	for (idx = 0; idx < count; idx++, va += PAGE_SIZE) {
 		if (vtophys(va) != 0)
 			pmap_kremove(va);
 	}
 }
 
 
 /*
  * pmap_object_init_pt preloads the ptes for a given object
  * into the specified pmap.  This eliminates the blast of soft
  * faults on process startup and immediately after an mmap.
  *
  * This implementation enters no mappings: it only asserts that the
  * object lock is held and that the object is a device or SG object.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault: TRUE only when page table metadata exists for the
  *	address and its PTE is completely empty.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pdep;
 	pt_entry_t *ptep;
 
 	if (!pmap_get_pde_pte(pmap, addr, &pdep, &ptep))
 		return (FALSE);
 
 	KASSERT(ptep != NULL, ("Valid mapping but no pte ?"));
 
 	return ((*ptep == 0) ? TRUE : FALSE);
 }
 
 /*
  * Fetch pointers to the PDE/PTE for the given pmap/VA pair.
  * Returns TRUE if the mapping exists, else FALSE.
  *
  * NOTE: This function is only used by a couple of arm-specific modules.
  * It is not safe to take any pmap locks here, since we could be right
  * in the middle of debugging the pmap anyway...
  *
  * Because nothing is locked, the metadata may be seen in an inconsistent
  * state, so FALSE can be returned even though a valid mapping exists.
  *
  * NOTE: *ptp is set to NULL when the L1 pde is a "section" mapping.
  * *pdp is written as soon as the pmap is known to have an L1 table, so
  * it may be set even when FALSE is returned.
  */
 boolean_t
 pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp)
 {
 	struct l2_dtable *dtable;
 	pd_entry_t *l1slot, pde;
 	pt_entry_t *table;
 	u_short idx;
 
 	if (pm->pm_l1 == NULL)
 		return (FALSE);
 
 	idx = L1_IDX(va);
 	l1slot = &pm->pm_l1->l1_kva[idx];
 	*pdp = l1slot;
 	pde = *l1slot;
 
 	/* Section mappings have no L2 table behind them. */
 	if (l1pte_section_p(pde)) {
 		*ptp = NULL;
 		return (TRUE);
 	}
 
 	if (pm->pm_l2 == NULL)
 		return (FALSE);
 
 	dtable = pm->pm_l2[L2_IDX(idx)];
 	if (dtable == NULL)
 		return (FALSE);
 
 	table = dtable->l2_bucket[L2_BUCKET(idx)].l2b_kva;
 	if (table == NULL)
 		return (FALSE);
 
 	*ptp = &table[l2pte_index(va)];
 	return (TRUE);
 }
 
 /*
  *      Routine:        pmap_remove_all
  *      Function:
  *              Removes this physical page from
  *              all physical maps in which it resides.
  *              Reflects back modify bits to the pager.
  *
  *      Notes:
  *              Original versions of this routine were very
  *              inefficient because they iteratively called
  *              pmap_remove (slow...)
  *
  *              The page must not be fictitious.  The TLB is only
  *              flushed when one of the removed mappings belonged to
  *              the current process's pmap or to the kernel pmap.
  */
 void
 pmap_remove_all(vm_page_t m)
 {
 	pv_entry_t pv;
 	pt_entry_t *ptep;
 	struct l2_bucket *l2b;
 	boolean_t flush = FALSE;
 	pmap_t curpm;
 	int flags = 0;
 
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_remove_all: page %p is fictitious", m));
 
 	/* Unlocked fast path: nothing maps this page. */
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		return;
 	rw_wlock(&pvh_global_lock);
 	curpm = vmspace_pmap(curproc->p_vmspace);
 	/* pmap_nuke_pv() unlinks the head entry on every iteration. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		if (flush == FALSE && (pv->pv_pmap == curpm ||
 		    pv->pv_pmap == pmap_kernel()))
 			flush = TRUE;
 
 		PMAP_LOCK(pv->pv_pmap);
 		l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va);
 		KASSERT(l2b != NULL, ("No l2 bucket"));
 		ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
 		/* A writable PTE is treated as a dirty page. */
 		if (L2_S_WRITABLE(*ptep))
 			vm_page_dirty(m);
 		*ptep = 0;
 		if (pmap_is_current(pv->pv_pmap))
 			PTE_SYNC(ptep);
 		pmap_free_l2_bucket(pv->pv_pmap, l2b, 1);
 		pv->pv_pmap->pm_stats.resident_count--;
 		/* Accumulate flags so the flush type can be chosen below. */
 		flags |= pv->pv_flags;
 		pmap_nuke_pv(m, pv->pv_pmap, pv);
 		PMAP_UNLOCK(pv->pv_pmap);
 		pmap_free_pv_entry(pv);
 	}
 	m->md.pvh_attrs &= ~(PVF_MOD | PVF_REF);
 
 	if (flush) {
 		if (PV_BEEN_EXECD(flags))
 			cpu_tlb_flushID();
 		else
 			cpu_tlb_flushD();
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Strip the cache attribute bits (L2_S_CACHE_MASK) from every kernel
  * small-page PTE covering [sva, sva + len).
  *
  * Ranges that have no L2 bucket are skipped.  Returns EINVAL if a PTE in
  * a present bucket is zero (no mapping), 0 otherwise.  Pages processed
  * before an EINVAL return keep their updated attributes.
  *
  * NOTE(review): "mode" is never read in this function; the cache bits are
  * always cleared regardless of the requested mode -- confirm intended.
  */
 int
 pmap_change_attr(vm_offset_t sva, vm_size_t len, int mode)
 {
 	vm_offset_t base, offset, tmpva;
 	vm_size_t size;
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_offset_t next_bucket;
 
 	PMAP_LOCK(kernel_pmap);
 
 	/* Widen the request to whole pages. */
 	base = trunc_page(sva);
 	offset = sva & PAGE_MASK;
 	size = roundup(offset + len, PAGE_SIZE);
 
 #ifdef checkit
 	/*
 	 * Only supported on kernel virtual addresses, including the direct
 	 * map but excluding the recursive map.
 	 */
 	if (base < DMAP_MIN_ADDRESS) {
 		PMAP_UNLOCK(kernel_pmap);
 		return (EINVAL);
 	}
 #endif
 	for (tmpva = base; tmpva < base + size; ) {
 		/* End of the current L2 bucket, clamped to the request. */
 		next_bucket = L2_NEXT_BUCKET(tmpva);
 		if (next_bucket > base + size)
 			next_bucket = base + size;
 
 		l2b = pmap_get_l2_bucket(kernel_pmap, tmpva);
 		if (l2b == NULL) {
 			/* Nothing mapped in this bucket; skip it entirely. */
 			tmpva = next_bucket;
 			continue;
 		}
 
 		ptep = &l2b->l2b_kva[l2pte_index(tmpva)];
 
 		if (*ptep == 0) {
 			PMAP_UNLOCK(kernel_pmap);
 			return(EINVAL);
 		}
 
 		/*
 		 * Write back and invalidate the caches for this page before
 		 * the PTE is rewritten, then drop the stale TLB entry.
 		 */
 		pte = *ptep &~ L2_S_CACHE_MASK;
 		cpu_idcache_wbinv_range(tmpva, PAGE_SIZE);
 		pmap_l2cache_wbinv_range(tmpva, pte & L2_S_FRAME, PAGE_SIZE);
 		*ptep = pte;
 		cpu_tlb_flushID_SE(tmpva);
 
 		dprintf("%s: for va:%x ptep:%x pte:%x\n",
 		    __func__, tmpva, (uint32_t)ptep, pte);
 		tmpva += PAGE_SIZE;
 	}
 
 	PMAP_UNLOCK(kernel_pmap);
 
 	return (0);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	Only protection reductions are acted upon: a request without read
  *	permission is handed to pmap_remove(), a request that includes
  *	write permission is a no-op, and anything else write-protects the
  *	currently writable pages in [sva, eva).
  */
 void
 pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_offset_t next_bucket;
 	u_int flags;
 	int flush;
 
 	if ((prot & VM_PROT_READ) == 0) {
 		pmap_remove(pm, sva, eva);
 		return;
 	}
 
 	if (prot & VM_PROT_WRITE) {
 		/*
 		 * If this is a read->write transition, just ignore it and let
 		 * vm_fault() take care of it later.
 		 */
 		return;
 	}
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pm);
 
 	/*
 	 * OK, at this point, we know we're doing write-protect operation.
 	 * If the pmap is active, write-back the range.
 	 */
 
 	/*
 	 * "flush" is tri-state: -1 means the range is shorter than four
 	 * pages and TLB entries are invalidated one at a time as PTEs
 	 * change; a value >= 0 counts changed PTEs so that a single full
 	 * TLB flush can be issued after the loop.
 	 */
 	flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1;
 	flags = 0;
 
 	while (sva < eva) {
 		/* Process one L2 bucket at a time, clamped to eva. */
 		next_bucket = L2_NEXT_BUCKET(sva);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		l2b = pmap_get_l2_bucket(pm, sva);
 		if (l2b == NULL) {
 			sva = next_bucket;
 			continue;
 		}
 
 		ptep = &l2b->l2b_kva[l2pte_index(sva)];
 
 		while (sva < next_bucket) {
 			/* Only valid, currently writable PTEs need work. */
 			if ((pte = *ptep) != 0 && L2_S_WRITABLE(pte)) {
 				struct vm_page *pg;
 				u_int f;
 
 				pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
 				pmap_set_prot(ptep, prot, !(pm == pmap_kernel()));
 				PTE_SYNC(ptep);
 
 				/*
 				 * Clear PVF_WRITE in the pv entry; "f" is
 				 * tested below for the write bit, so the page
 				 * is marked dirty when the mapping had it set.
 				 */
 				f = pmap_modify_pv(pg, pm, sva,
 				    PVF_WRITE, 0);
 				if (f & PVF_WRITE)
 					vm_page_dirty(pg);
 
 				if (flush >= 0) {
 					/* Deferred mode: just collect flags. */
 					flush++;
 					flags |= f;
 				} else
 				if (PV_BEEN_EXECD(f))
 					cpu_tlb_flushID_SE(sva);
 				else
 				if (PV_BEEN_REFD(f))
 					cpu_tlb_flushD_SE(sva);
 			}
 
 			sva += PAGE_SIZE;
 			ptep++;
 		}
 	}
 
 
 	/*
 	 * In per-page mode flush is -1 (non-zero) here, but flags is still
 	 * 0, so neither branch below fires and no second flush happens.
 	 */
 	if (flush) {
 		if (PV_BEEN_EXECD(flags))
 			cpu_tlb_flushID();
 		else
 		if (PV_BEEN_REFD(flags))
 			cpu_tlb_flushD();
 	}
 	rw_wunlock(&pvh_global_lock);
 
 	PMAP_UNLOCK(pm);
 }
 
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	This is a locking wrapper: it takes the pvh global write lock and
  *	the pmap lock, then calls pmap_enter_locked() with M_WAITOK.  The
  *	"access" argument is not used by this implementation.
  */
 
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
     vm_prot_t prot, boolean_t wired)
 {
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	pmap_enter_locked(pmap, va, m, prot, wired, M_WAITOK);
 	PMAP_UNLOCK(pmap);
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  *	Install (or update) the mapping of page "m" at "va" in "pmap".
  *
  *	The pvh global and pmap locks must be held.
  *
  *	pmap/va	target address space and virtual address
  *	m	page to map; ignored (treated as NULL) when va is the
  *		vector page, in which case systempage.pv_pa is mapped
  *	prot	requested protection (VM_PROT_*)
  *	wired	whether the mapping is recorded as wired in its pv entry
  *	flags	M_WAITOK: sleep (dropping and retaking both locks) until an
  *		L2 bucket can be allocated; otherwise the function returns
  *		silently without creating a mapping if the allocation fails
  */
 static void
 pmap_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     boolean_t wired, int flags)
 {
 	struct l2_bucket *l2b = NULL;
 	struct vm_page *opg;
 	struct pv_entry *pve = NULL;
 	pt_entry_t *ptep, npte, opte;
 	u_int nflags;
 	u_int oflags;
 	vm_paddr_t pa;
 	u_char user;
 
 	PMAP_ASSERT_LOCKED(pmap);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if (va == vector_page) {
 		/* The vector page is mapped without any vm_page bookkeeping. */
 		pa = systempage.pv_pa;
 		m = NULL;
 	} else {
 		KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
 		    (flags & M_NOWAIT) != 0,
 		    ("pmap_enter_locked: page %p is not busy", m));
 		pa = VM_PAGE_TO_PHYS(m);
 	}
 
 	user = 0;
 	/*
 	 * Make sure userland mappings get the right permissions
 	 */
 	if (pmap != pmap_kernel() && va != vector_page)
 		user = 1;
 
 	/* Translate the request into pv-entry flag bits. */
 	nflags = 0;
 
 	if (prot & VM_PROT_WRITE)
 		nflags |= PVF_WRITE;
 	if (prot & VM_PROT_EXECUTE)
 		nflags |= PVF_EXEC;
 	if (wired)
 		nflags |= PVF_WIRED;
 
 	PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, "
 	    "wired = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, wired));
 
 	if (pmap == pmap_kernel()) {
 		l2b = pmap_get_l2_bucket(pmap, va);
 		if (l2b == NULL)
 			l2b = pmap_grow_l2_bucket(pmap, va);
 	} else {
 do_l2b_alloc:
 		l2b = pmap_alloc_l2_bucket(pmap, va);
 		if (l2b == NULL) {
 			if (flags & M_WAITOK) {
 				/*
 				 * Sleeping with the locks held is not
 				 * allowed; drop both, wait for memory and
 				 * retry from scratch.
 				 */
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(&pvh_global_lock);
 				VM_WAIT;
 				rw_wlock(&pvh_global_lock);
 				PMAP_LOCK(pmap);
 				goto do_l2b_alloc;
 			}
 			return;
 		}
 	}
 
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 
 	opte = *ptep;
 	npte = pa;
 	oflags = 0;
 	if (opte) {
 		/*
 		 * There is already a mapping at this address.
 		 * If the physical address is different, lookup the
 		 * vm_page.
 		 */
 		if (l2pte_pa(opte) != pa)
 			opg = PHYS_TO_VM_PAGE(l2pte_pa(opte));
 		else
 			opg = m;
 	} else
 		opg = NULL;
 
 	if ((prot & (VM_PROT_ALL)) ||
 	    (!m || m->md.pvh_attrs & PVF_REF)) {
 		/*
 		 * - The access type indicates that we don't need
 		 *   to do referenced emulation.
 		 * OR
 		 * - The physical page has already been referenced
 		 *   so no need to re-do referenced emulation here.
 		 */
 		npte |= L2_S_PROTO;
 #ifdef SMP
 		npte |= L2_SHARED;
 #endif
 
 		nflags |= PVF_REF;
 
 		if (m && ((prot & VM_PROT_WRITE) != 0 ||
 		    (m->md.pvh_attrs & PVF_MOD))) {
 			/*
 			 * Either a writable mapping was requested or the
 			 * page is already marked modified: record the
 			 * mapping as modified from the outset, and dirty
 			 * the page if it was not yet marked so.
 			 */
 			nflags |= PVF_MOD;
 			if (!(m->md.pvh_attrs & PVF_MOD))
 				vm_page_dirty(m);
 		}
 		if (m && opte)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 	} else {
 		/*
 		 * Need to do page referenced emulation.
 		 */
 		npte |= L2_TYPE_INV;
 	}
 
 	npte |= L2_S_PROT_R;
 
 	if (prot & VM_PROT_WRITE) {
 		npte &= ~(L2_APX);
 
 		if (m != NULL &&
 		    (m->oflags & VPO_UNMANAGED) == 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 	if (user)
 		npte |= L2_S_PROT_U;
 
 	if (!(prot & VM_PROT_EXECUTE) && m)
 		npte |= L2_XN;
 
 	/*
 	 * m is NULL for the vector page, so it must not be dereferenced
 	 * unconditionally here.  With no vm_page there is no per-page
 	 * memory attribute asking for an uncached mapping, so the vector
 	 * page gets the default cache mode.
 	 */
 	if (m == NULL || m->md.pv_memattr != VM_MEMATTR_UNCACHEABLE)
 		npte |= pte_l2_s_cache_mode;
 
 	if (m && m == opg) {
 		/*
 		 * We're changing the attrs of an existing mapping.
 		 */
 		oflags = pmap_modify_pv(m, pmap, va,
 		    PVF_WRITE | PVF_EXEC | PVF_WIRED |
 		    PVF_MOD | PVF_REF, nflags);
 	} else {
 		/*
 		 * New mapping, or changing the backing page
 		 * of an existing mapping.
 		 */
 		if (opg) {
 			/*
 			 * Replacing an existing mapping with a new one.
 			 * It is part of our managed memory so we
 			 * must remove it from the PV list
 			 */
 			if ((pve = pmap_remove_pv(opg, pmap, va))) {
 				oflags = pve->pv_flags;
 
 				/*
 				 * An unmanaged replacement page needs no pv
 				 * entry, so the old one is released instead
 				 * of being recycled below.
 				 */
 				if (m && ((m->oflags & VPO_UNMANAGED))) {
 					pmap_free_pv_entry(pve);
 					pve = NULL;
 				}
 			}
 		}
 
 		if ((m && !(m->oflags & VPO_UNMANAGED))) {
 			/* Reuse the old pv entry if we have one. */
 			if ((!pve) && (pve = pmap_get_pv_entry()) == NULL)
 				panic("pmap_enter: no pv entries");
 
 			KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 			("pmap_enter: managed mapping within the clean submap"));
 			KASSERT(pve != NULL, ("No pv"));
 			pmap_enter_pv(m, pve, pmap, va, nflags);
 		}
 	}
 
 	/*
 	 * Keep the stats up to date
 	 */
 	if (opte == 0) {
 		l2b->l2b_occupancy++;
 		pmap->pm_stats.resident_count++;
 	}
 
 	CTR5(KTR_PMAP,"enter: pmap:%p va:%x prot:%x pte:%x->%x",
 	    pmap, va, prot, opte, npte);
 	/*
 	 * If this is just a wiring change, the two PTEs will be
 	 * identical, so there's no need to update the page table.
 	 */
 	if (npte != opte) {
 		boolean_t is_cached = pmap_is_current(pmap);
 
 		*ptep = npte;
 		PTE_SYNC(ptep);
 		if (is_cached) {
 			/*
 			 * We only need to frob the cache/tlb if this pmap
 			 * is current
 			 */
 			if (L1_IDX(va) != L1_IDX(vector_page) &&
 			    l2pte_valid(npte)) {
 				/*
 				 * This mapping is likely to be accessed as
 				 * soon as we return to userland. Fix up the
 				 * L1 entry to avoid taking another
 				 * page/domain fault.
 				 */
 				pd_entry_t *pl1pd, l1pd;
 
 				pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)];
 				l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) |
 				    L1_C_PROTO;
 				if (*pl1pd != l1pd) {
 					*pl1pd = l1pd;
 					PTE_SYNC(pl1pd);
 				}
 			}
 		}
 
 		/* Drop any stale TLB entry left by the previous mapping. */
 		if (PV_BEEN_EXECD(oflags))
 			cpu_tlb_flushID_SE(va);
 		else if (PV_BEEN_REFD(oflags))
 			cpu_tlb_flushD_SE(va);
 	}
 
 	if ((pmap != pmap_kernel()) && (pmap == &curproc->p_vmspace->vm_pmap))
 		cpu_icache_sync_range(va, PAGE_SIZE);
 }
 
 /*
  * Maps a sequence of resident pages belonging to the same object.
  * The sequence begins with the given page m_start.  This page is
  * mapped at the given virtual address start.  Each subsequent page is
  * mapped at a virtual address that is offset from start by the same
  * amount as the page is offset from m_start within the object.  The
  * last page in the sequence is the page with the largest offset from
  * m_start that can be mapped at a virtual address less than the given
  * virtual address end.  Not every virtual page between start and end
  * is mapped; only those for which a resident page exists with the
  * corresponding offset from m_start are mapped.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_page_t pg;
 	vm_pindex_t delta, npages;
 
 	/* Number of pages that fit between start and end. */
 	npages = atop(end - start);
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		/* Stop at the first page whose offset falls past the window. */
 		delta = pg->pindex - m_start->pindex;
 		if (delta >= npages)
 			break;
 		/* Never writable, never wired, and never sleeps. */
 		pmap_enter_locked(pmap, start + ptoa(delta), pg,
 		    prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE, M_NOWAIT);
 	}
 	PMAP_UNLOCK(pmap);
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * this code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  *
  * In this implementation it is a locking wrapper around
  * pmap_enter_locked(): write permission is masked out of "prot", the
  * mapping is entered unwired, and M_NOWAIT is passed so the call never
  * sleeps.
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	pmap_enter_locked(pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
 	    FALSE, M_NOWAIT);
 	PMAP_UNLOCK(pmap);
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  *	Routine:	pmap_change_wiring
  *	Function:	Change the wiring attribute for a map/virtual-address
  *			pair.
  *	In/out conditions:
  *			The mapping must already exist in the pmap.
  *
  *	Only the PVF_WIRED flag of the mapping's pv entry is updated; the
  *	PTE itself is not modified.  Nothing is done when the mapped
  *	physical address has no vm_page.
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_page_t pg;
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	l2b = pmap_get_l2_bucket(pmap, va);
 	KASSERT(l2b, ("No l2b bucket in pmap_change_wiring"));
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 	pte = *ptep;
 	pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
 	if (pg) {
 		/*
 		 * pmap_modify_pv() takes a clear mask and a set mask of PVF_*
 		 * bits.  "wired" is a boolean, not a flag word, so it has to
 		 * be converted explicitly; passing it raw would set whatever
 		 * PVF_* bit happens to have the value 1.
 		 */
 		pmap_modify_pv(pg, pmap, va, PVF_WIRED,
 		    wired ? PVF_WIRED : 0);
 	}
 	/* Release in the reverse order of acquisition. */
 	PMAP_UNLOCK(pmap);
 	rw_wunlock(&pvh_global_lock);
 }
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *	This implementation does nothing: no mappings are copied.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
     vm_size_t len, vm_offset_t src_addr)
 {
 	/* Intentionally empty. */
 }
 
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  *
  *		Takes the pmap lock around pmap_extract_locked(), which
  *		returns 0 when it finds no L2 mapping for the address.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t pa;
 
 	PMAP_LOCK(pmap);
 	pa = pmap_extract_locked(pmap, va);
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  * Translate "va" to a physical address using the page tables of "pmap".
  * Returns 0 when no L2 metadata or no PTE exists for the address.  For
  * any pmap other than the kernel pmap the pmap lock must be held.
  */
 static vm_paddr_t
 pmap_extract_locked(pmap_t pmap, vm_offset_t va)
 {
 	struct l2_dtable *dtable;
 	pd_entry_t pde;
 	pt_entry_t *l2_table, entry;
 	u_int slot;
 
 	if (pmap != kernel_pmap)
 		PMAP_ASSERT_LOCKED(pmap);
 
 	slot = L1_IDX(va);
 	pde = pmap->pm_l1->l1_kva[slot];
 
 	if (l1pte_section_p(pde)) {
 		/*
 		 * These should only happen for the kernel pmap.
 		 */
 		KASSERT(pmap == kernel_pmap, ("unexpected section"));
 		/* XXX: what to do about the bits > 32 ? */
 		if (pde & L1_S_SUPERSEC)
 			return ((pde & L1_SUP_FRAME) | (va & L1_SUP_OFFSET));
 		return ((pde & L1_S_FRAME) | (va & L1_S_OFFSET));
 	}
 
 	/*
 	 * Note that we can't rely on the validity of the L1
 	 * descriptor as an indication that a mapping exists.
 	 * We have to look it up in the L2 dtable.
 	 */
 	dtable = pmap->pm_l2[L2_IDX(slot)];
 	if (dtable == NULL)
 		return (0);
 
 	l2_table = dtable->l2_bucket[L2_BUCKET(slot)].l2b_kva;
 	if (l2_table == NULL)
 		return (0);
 
 	entry = l2_table[l2pte_index(va)];
 	if (entry == 0)
 		return (0);
 
 	/* Large pages use a wider frame/offset split than small pages. */
 	if ((entry & L2_TYPE_MASK) == L2_TYPE_L)
 		return ((entry & L2_L_FRAME) | (va & L2_L_OFFSET));
 	return ((entry & L2_S_FRAME) | (va & L2_S_OFFSET));
 }
 
 /*
  * Atomically extract and hold the physical page with the given
  * pmap and virtual address pair if that mapping permits the given
  * protection.
  *
  * Returns the held page, or NULL when there is no mapping or the mapping
  * does not allow the requested write access.
  *
  * vm_page_pa_tryrelock() may have to drop and retake the pmap lock; when
  * it reports that, the lookup is restarted with a physical-address lock
  * already recorded in "paddr".  Every exit therefore goes through the
  * common unlock sequence at "out" so that lock is never leaked, even when
  * the mapping disappeared while the pmap lock was dropped.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	struct l2_dtable *l2;
 	pd_entry_t l1pd;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa, paddr;
 	vm_page_t m = NULL;
 	u_int l1idx;
 
 	l1idx = L1_IDX(va);
 	paddr = 0;
 
 	PMAP_LOCK(pmap);
 retry:
 	l1pd = pmap->pm_l1->l1_kva[l1idx];
 	if (l1pte_section_p(l1pd)) {
 		/*
 		 * These should only happen for pmap_kernel()
 		 */
 		KASSERT(pmap == pmap_kernel(), ("huh"));
 		/* XXX: what to do about the bits > 32 ? */
 		if (l1pd & L1_S_SUPERSEC)
 			pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET);
 		else
 			pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET);
 		if (vm_page_pa_tryrelock(pmap, pa & PG_FRAME, &paddr))
 			goto retry;
 		if (L1_S_WRITABLE(l1pd) || (prot & VM_PROT_WRITE) == 0) {
 			m = PHYS_TO_VM_PAGE(pa);
 			vm_page_hold(m);
 		}
 	} else {
 		/*
 		 * Note that we can't rely on the validity of the L1
 		 * descriptor as an indication that a mapping exists.
 		 * We have to look it up in the L2 dtable.
 		 */
 		l2 = pmap->pm_l2[L2_IDX(l1idx)];
 
 		if (l2 == NULL ||
 		    (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL)
 			goto out;
 
 		ptep = &ptep[l2pte_index(va)];
 		pte = *ptep;
 
 		/* No mapping at all. */
 		if (pte == 0)
 			goto out;
 
 		/* Write access requested but L2_APX is set in the PTE. */
 		if ((prot & VM_PROT_WRITE) && (pte & L2_APX))
 			goto out;
 
 		switch (pte & L2_TYPE_MASK) {
 		case L2_TYPE_L:
 			panic("extract and hold section mapping");
 			break;
 		default:
 			pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET);
 			break;
 		}
 		if (vm_page_pa_tryrelock(pmap, pa & PG_FRAME, &paddr))
 			goto retry;
 		m = PHYS_TO_VM_PAGE(pa);
 		vm_page_hold(m);
 	}
 
 out:
 	PMAP_UNLOCK(pmap);
 	PA_UNLOCK_COND(paddr);
 	return (m);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * Sets up the lock, allocates an L1 table, clears the L2 metadata, the
  * active-CPU set and the statistics, and initializes the pv list.  When
  * the vector page lies below KERNBASE it is entered into the new pmap
  * as a wired, read-only mapping.  Always returns 1.
  */
 
 int
 pmap_pinit(pmap_t pmap)
 {
 	PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap));
 
 	PMAP_LOCK_INIT(pmap);
 	pmap_alloc_l1(pmap);
 	bzero(pmap->pm_l2, sizeof(pmap->pm_l2));
 
 	CPU_ZERO(&pmap->pm_active);
 
 	TAILQ_INIT(&pmap->pm_pvlist);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	/* NOTE(review): the reason for starting at 1 is not visible here. */
 	pmap->pm_stats.resident_count = 1;
 	if (vector_page < KERNBASE) {
 		/*
 		 * pmap_enter_locked() ignores the vm_page argument for the
 		 * vector page and maps systempage.pv_pa directly.
 		 */
 		pmap_enter(pmap, vector_page,
 		    VM_PROT_READ, PHYS_TO_VM_PAGE(systempage.pv_pa),
 		    VM_PROT_READ, 1);
 	}
 	return (1);
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 
 /*
  * Return a pv entry to its UMA zone and account for it in
  * pv_entry_count.
  */
 static void
 pmap_free_pv_entry(pv_entry_t pv)
 {
 	pv_entry_count--;
 	uma_zfree(pvzone, pv);
 }
 
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  * the memory allocation is performed bypassing the malloc code
  * because of the possibility of allocations at interrupt time.
  */
 static pv_entry_t
 pmap_get_pv_entry(void)
 {
 	pv_entry_t ret_value;
 
 	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
 		pagedaemon_wakeup();
 	ret_value = uma_zalloc(pvzone, M_NOWAIT);
 	return ret_value;
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	While the pmap is current, the first few removed pages have their
  *	TLB entries invalidated individually; once
  *	PMAP_REMOVE_CLEAN_LIST_SIZE removals have been counted, a single
  *	full TLB flush is issued at the end instead.
  */
 #define	PMAP_REMOVE_CLEAN_LIST_SIZE	3
 void
 pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
 {
 	struct l2_bucket *l2b;
 	vm_offset_t next_bucket;
 	pt_entry_t *ptep;
 	u_int total;
 	u_int mappings, is_exec, is_refd;
 	int flushall = 0;
 
 
 	/*
 	 * we lock in the pmap => pv_head direction
 	 */
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pm);
 	/* Removals seen so far while the pmap was current. */
 	total = 0;
 	while (sva < eva) {
 		/*
 		 * Do one L2 bucket's worth at a time.
 		 */
 		next_bucket = L2_NEXT_BUCKET(sva);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		l2b = pmap_get_l2_bucket(pm, sva);
 		if (l2b == NULL) {
 			sva = next_bucket;
 			continue;
 		}
 
 		ptep = &l2b->l2b_kva[l2pte_index(sva)];
 		/* PTEs cleared in this bucket, for pmap_free_l2_bucket(). */
 		mappings = 0;
 
 		while (sva < next_bucket) {
 			struct vm_page *pg;
 			pt_entry_t pte;
 			vm_paddr_t pa;
 
 			pte = *ptep;
 
 			if (pte == 0) {
 				/*
 				 * Nothing here, move along
 				 */
 				sva += PAGE_SIZE;
 				ptep++;
 				continue;
 			}
 
 			pm->pm_stats.resident_count--;
 			pa = l2pte_pa(pte);
 			/*
 			 * Defaults used when no pv entry is found: treat the
 			 * mapping as referenced but not executed.
 			 */
 			is_exec = 0;
 			is_refd = 1;
 
 			/*
 			 * Update flags. In a number of circumstances,
 			 * we could cluster a lot of these and do a
 			 * number of sequential pages in one go.
 			 */
 			if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) {
 				struct pv_entry *pve;
 
 				pve = pmap_remove_pv(pg, pm, sva);
 				if (pve) {
 					is_exec = PV_BEEN_EXECD(pve->pv_flags);
 					is_refd = PV_BEEN_REFD(pve->pv_flags);
 					pmap_free_pv_entry(pve);
 				}
 			}
 
 			if (pmap_is_current(pm)) {
 				total++;
 				if (total < PMAP_REMOVE_CLEAN_LIST_SIZE) {
 					if (is_exec)
 						cpu_tlb_flushID_SE(sva);
 					else if (is_refd)
 						cpu_tlb_flushD_SE(sva);
 				} else if (total == PMAP_REMOVE_CLEAN_LIST_SIZE) {
 					/* Too many pages: flush everything once. */
 					flushall = 1;
 				}
 			}
 			/*
 			 * NOTE(review): the per-page TLB invalidation above is
 			 * issued before the PTE is cleared here -- confirm the
 			 * ordering is intended.
 			 */
 			*ptep = 0;
 			PTE_SYNC(ptep);
 
 			sva += PAGE_SIZE;
 			ptep++;
 			mappings++;
 		}
 
 		pmap_free_l2_bucket(pm, l2b, mappings);
 	}
 
 	rw_wunlock(&pvh_global_lock);
 	if (flushall)
 		cpu_tlb_flushID();
 	PMAP_UNLOCK(pm);
 }
 
 /*
  * pmap_zero_page()
  *
  * Zero a given physical page by mapping it at a page hook point.
  * In doing the zero page op, the page we zero is mapped cachable, as with
  * StrongARM accesses to non-cached pages are non-burst making writing
  * _any_ bulk data very slow.
  *
  * pmap_zero_page_gen() is the common worker: it zeroes "size" bytes
  * starting "off" bytes into the page, using the cdstp/cdst_pte hook
  * under the cmtx mutex.  It panics if the page still has mappings on
  * its pv list.
  */
 static void
 pmap_zero_page_gen(vm_page_t pg, int off, int size)
 {
 
 	vm_paddr_t phys = VM_PAGE_TO_PHYS(pg);
 	if (!TAILQ_EMPTY(&pg->md.pv_list))
 		panic("pmap_zero_page: page has mappings");
 
 	mtx_lock(&cmtx);
 	/*
 	 * Hook in the page, zero it, invalidate the TLB as needed.
 	 *
 	 * NOTE(review): an inherited remark here said the temporary
 	 * zero-page mapping must be non-cached to work without corruption
 	 * when write-allocate is enabled, but the PTE built below includes
 	 * pte_l2_s_cache_mode -- confirm which is intended.
 	 */
 	*cdst_pte = L2_S_PROTO | phys | pte_l2_s_cache_mode;
 	pmap_set_prot(cdst_pte, VM_PROT_WRITE, 0);
 	PTE_SYNC(cdst_pte);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	/* Whole-page requests take the dedicated bzero_page() path. */
 	if (off || size != PAGE_SIZE)
 		bzero((void *)(cdstp + off), size);
 	else
 		bzero_page(cdstp);
 
 	/*
 	 * Although aliasing is not possible if we use
 	 * cdstp temporary mappings with memory that
 	 * will be mapped later as non-cached or with write-through
 	 * caches we might end up overwriting it when calling wbinv_all
 	 * So make sure caches are clean after copy operation
 	 */
 	cpu_idcache_wbinv_range(cdstp, size);
 	pmap_l2cache_wbinv_range(cdstp, phys, size);
 
 	mtx_unlock(&cmtx);
 }
 
 /*
  *	pmap_zero_page zeros the specified hardware page by mapping
  *	the page into KVM and using bzero to clear its contents.
  *
  *	The whole page is cleared via pmap_zero_page_gen().
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	pmap_zero_page_gen(m, 0, PAGE_SIZE);
 }
 
 
 /*
  *	pmap_zero_page_area zeros part of the specified hardware page by
  *	mapping the page into KVM and using bzero to clear "size" bytes
  *	starting at byte offset "off".
  *
  *	off and size may not cover an area beyond a single hardware page;
  *	this is not checked here.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 
 	pmap_zero_page_gen(m, off, size);
 }
 
 
 /*
  *	pmap_zero_page_idle zeros the specified hardware page by mapping
  *	the page into KVM and using bzero to clear its contents.  This
  *	is intended to be called from the vm_pagezero process only and
  *	outside of Giant.
  *
  *	This implementation simply calls pmap_zero_page().
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 
 	pmap_zero_page(m);
 }
 
 /*
  * pmap_copy_page_generic()
  *
  * Copy one physical page into another, by mapping the pages into
  * hook points (csrcp/cdstp) and using bcopy_page. The same comment
  * regarding cachability as in pmap_zero_page also applies here.
  *
  * "src" and "dst" are physical addresses.  The hook points are
  * serialized by the cmtx mutex.
  */
 void
 pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst)
 {
 	/*
 	 * Hold cmtx for the duration of the copy so that nobody else
 	 * can reuse the hook mappings while we have them set up.
 	 * Map the pages into the page hook points, copy them, and purge
 	 * the cache for the appropriate page. Invalidate the TLB
 	 * as required.
 	 */
 	mtx_lock(&cmtx);
 
 	/* For ARMv6 using System bit is deprecated and mapping with AP
 	 * bits set to 0x0 makes page not accessible. csrc_pte is mapped
 	 * read/write until proper mapping defines are created for ARMv6.
 	 */
 	*csrc_pte = L2_S_PROTO | src | pte_l2_s_cache_mode;
 	pmap_set_prot(csrc_pte, VM_PROT_READ, 0);
 	PTE_SYNC(csrc_pte);
 
 	*cdst_pte = L2_S_PROTO | dst | pte_l2_s_cache_mode;
 	pmap_set_prot(cdst_pte, VM_PROT_READ | VM_PROT_WRITE, 0);
 	PTE_SYNC(cdst_pte);
 
 	/* Drop any stale translations for the two hook addresses. */
 	cpu_tlb_flushD_SE(csrcp);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 
 	bcopy_page(csrcp, cdstp);
 
 	/*
 	 * Although aliasing is not possible if we use
 	 * cdstp temporary mappings with memory that
 	 * will be mapped later as non-cached or with write-through
 	 * caches we might end up overwriting it when calling wbinv_all
 	 * So make sure caches are clean after copy operation
 	 */
 	cpu_idcache_wbinv_range(cdstp, PAGE_SIZE);
 	pmap_l2cache_wbinv_range(cdstp, dst, PAGE_SIZE);
 
 	mtx_unlock(&cmtx);
 }
 
 /*
  * Copy the contents of page "src" to page "dst".  A platform-supplied
  * _arm_memcpy hook is tried first when one is installed and a page is
  * at least _min_memcpy_size bytes; if it is absent or reports failure
  * the generic hook-point copy is used.
  */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 
 	if (_arm_memcpy != NULL && PAGE_SIZE >= _min_memcpy_size) {
 		/* A zero return means the hook completed the copy. */
 		if (_arm_memcpy((void *)VM_PAGE_TO_PHYS(dst),
 		    (void *)VM_PAGE_TO_PHYS(src), PAGE_SIZE,
 		    IS_PHYSICAL) == 0)
 			return;
 	}
 
 	pmap_copy_page_generic(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
 }
 
 /*
  * Report whether page "m" is mapped by "pmap".  This is a bounded
  * check: only the first 16 entries of the page's pv list are examined,
  * so FALSE may be returned for a mapping that sits further down the
  * list.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	boolean_t found;
 	int examined;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 
 	found = FALSE;
 	rw_wlock(&pvh_global_lock);
 	pv = TAILQ_FIRST(&m->md.pv_list);
 	for (examined = 0; pv != NULL && examined < 16; examined++) {
 		if (pv->pv_pmap == pmap) {
 			found = TRUE;
 			break;
 		}
 		pv = TAILQ_NEXT(pv, pv_list);
 	}
 	rw_wunlock(&pvh_global_lock);
 
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Count the pv entries of the given page that carry PVF_WIRED.
  *	Fictitious pages always yield 0.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	pv_entry_t pv;
 	int wired = 0;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return (wired);
 
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if ((pv->pv_flags & PVF_WIRED) == 0)
 			continue;
 		wired++;
 	}
 	rw_wunlock(&pvh_global_lock);
 
 	return (wired);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Tell whether the page's cached attributes have PVF_REF set, i.e.
  *	whether it has been recorded as referenced.  The page must be
  *	managed.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 
 	if ((m->md.pvh_attrs & PVF_REF) == 0)
 		return (FALSE);
 	return (TRUE);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Clear the reference bits of a managed page through
  *	pmap_clearbit() and hand back that function's result as the
  *	count.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	int cleared;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 
 	cleared = pmap_clearbit(m, PVF_REF);
 	return (cleared);
 }
 
 
 /*
  *	Tell whether the page's cached attributes have PVF_MOD set.
  *	The page must be managed.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	return ((m->md.pvh_attrs & PVF_MOD) != 0 ? TRUE : FALSE);
 }
 
 
 /*
  *	Clear the modify bits on the specified physical page.
  *
  *	The page must be managed and not busy, and its object must be
  *	locked by the caller.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	KASSERT((m->oflags & VPO_BUSY) == 0,
 	    ("pmap_clear_modify: page %p is busy", m));
 
 	/*
 	 * A page that is not PGA_WRITEABLE cannot have modified mappings,
 	 * and with the object locked and the page not VPO_BUSY nobody can
 	 * set PGA_WRITEABLE behind our back.  So there is work to do only
 	 * when the page is both writeable and currently marked modified.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) != 0 &&
 	    (m->md.pvh_attrs & PVF_MOD) != 0)
 		pmap_clearbit(m, PVF_MOD);
 }
 
 
 /*
  *	pmap_clear_reference:
  *
  *	Drop the reference bit of a managed page, if it is set.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
 
 	/* Nothing recorded as referenced: avoid the pv list walk. */
 	if ((m->md.pvh_attrs & PVF_REF) == 0)
 		return;
 
 	pmap_clearbit(m, PVF_REF);
 }
 
 
 /*
  * Revoke write access from the given managed page by clearing
  * PVF_WRITE through pmap_clearbit().  The page's object must be locked.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/*
 	 * With the object locked, PGA_WRITEABLE can only be set by another
 	 * thread while the page is VPO_BUSY.  A page that is neither busy
 	 * nor writeable therefore has no PTE that needs updating.
 	 */
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 
 	pmap_clearbit(m, PVF_WRITE);
 }
 
 
 /*
  * perform the pmap work for mincore
  *
  * Returns a mask of MINCORE_* flags describing the mapping of "addr" in
  * "pmap", or 0 when there is no valid small-page PTE for it.  When the
  * result is not already conclusive for a managed page, the page's
  * physical-address lock is acquired through vm_page_pa_tryrelock() and
  * recorded in *locked_pa; on every other path any lock recorded in
  * *locked_pa is released via PA_UNLOCK_COND().
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	vm_page_t m;
 	int val;
 	boolean_t managed;
 
 	PMAP_LOCK(pmap);
 retry:
 	l2b = pmap_get_l2_bucket(pmap, addr);
 	if (l2b == NULL) {
 		val = 0;
 		goto out;
 	}
 	ptep = &l2b->l2b_kva[l2pte_index(addr)];
 	pte = *ptep;
 	if (!l2pte_valid(pte)) {
 		val = 0;
 		goto out;
 	}
 	val = MINCORE_INCORE;
 	/* A PTE that tests writable is reported as modified. */
 	if (L2_S_WRITABLE(pte))
 		val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 	managed = FALSE;
 	pa = l2pte_pa(pte);
 	m = PHYS_TO_VM_PAGE(pa);
 	if (m != NULL && (m->oflags & VPO_UNMANAGED) == 0)
 		managed = TRUE;
 	if (managed) {
 		/*
 		 * The ARM pmap tries to maintain a per-mapping
 		 * reference bit.  The trouble is that it's kept in
 		 * the PV entry, not the PTE, so it's costly to access
 		 * here.  You would need to acquire the pvh global
 		 * lock, call pmap_find_pv(), and introduce a custom
 		 * version of vm_page_pa_tryrelock() that releases and
 		 * reacquires the pvh global lock.  In the end, I
 		 * doubt it's worthwhile.  This may falsely report
 		 * the given address as referenced.
 		 */
 		if ((m->md.pvh_attrs & PVF_REF) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	}
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 	/*
 	 * The "out" label sits inside the else arm on purpose: both the
 	 * early exits above and the "answer is conclusive" case release
 	 * whatever lock *locked_pa still records.
 	 */
 out:
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 /*
  * Instruction-cache synchronization hook for the range [va, va + sz) of
  * pmap "pm".  This implementation is empty and performs no cache
  * maintenance.
  */
 void
 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 {
 }
 
 /*
  *	Increase the starting virtual address of the given mapping if a
  *	different alignment might result in more superpage mappings.
  *
  *	This implementation is empty: *addr is never adjusted.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 }
 
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  *
  * "pa" need not be page aligned: the mapping covers every page touched
  * by [pa, pa + size) and the returned pointer carries the same in-page
  * offset as "pa".  Panics if no kernel virtual address space is
  * available.
  */
 void *
 pmap_mapdev(vm_offset_t pa, vm_size_t size)
 {
 	vm_offset_t va, tmpva, offset;
 
 	/*
 	 * Split the request into a page-aligned base and an in-page
 	 * offset, and size the mapping from the aligned base.  Rounding
 	 * "size" alone would leave the last page unmapped whenever an
 	 * unaligned region crosses a page boundary, and would hand an
 	 * unaligned physical address to pmap_kenter_internal().
 	 */
 	offset = pa & PAGE_MASK;
 	pa = trunc_page(pa);
 	size = roundup(offset + size, PAGE_SIZE);
 
 	GIANT_REQUIRED;
 
 	va = kmem_alloc_nofault(kernel_map, size);
 	if (!va)
 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 	for (tmpva = va; size > 0;) {
 		pmap_kenter_internal(tmpva, pa, 0);
 		size -= PAGE_SIZE;
 		tmpva += PAGE_SIZE;
 		pa += PAGE_SIZE;
 	}
 
 	return ((void *)(va + offset));
 }
 
 /*
  * pmap_map_section:
  *
  *	Create a single section mapping.
  */
 void
 pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
     int prot, int cache)
 {
 	pd_entry_t *l1_table, *slot;
 	pd_entry_t mem_attr;
 
 	/* Both addresses must be aligned to a section boundary. */
 	KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2"));
 
 	l1_table = (pd_entry_t *) l1pt;
 	mem_attr = l1_mem_types[cache];
 	slot = &l1_table[va >> L1_S_SHIFT];
 
 	/* Build the kernel-domain section descriptor and publish it. */
 	*slot = L1_S_PROTO | pa |
 	    L1_S_PROT(PTE_KERNEL, prot) | mem_attr |
 	    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 	PTE_SYNC(slot);
 }
 
 /*
  * pmap_link_l2pt:
  *
  *	Link the L2 page table specified by l2pv.pv_pa into the L1
  *	page table at the slot for "va".
  */
 void
 pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv)
 {
 	pd_entry_t *l1_table, *l1_entry;
 	pd_entry_t desc;
 	u_int slot;
 
 	l1_table = (pd_entry_t *) l1pt;
 	slot = va >> L1_S_SHIFT;
 	l1_entry = &l1_table[slot];
 
 	/* Coarse (L2 table) descriptor in the kernel domain. */
 	desc = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO;
 
 #ifdef VERBOSE_INIT_ARM
 	printf("pmap_link_l2pt: pa=0x%x va=0x%x\n", l2pv->pv_pa, l2pv->pv_va);
 #endif
 
 	*l1_entry = desc | (l2pv->pv_pa + 0x000);
 	PTE_SYNC(l1_entry);
 
 	/* Record the table so kernel_pt_lookup() users can find it later. */
 	SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list);
 }
 
 /*
  * pmap_map_entry
  *
  *	Create a single page mapping.
  */
 void
 pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot,
     int cache)
 {
 	pd_entry_t *l1_table;
 	pt_entry_t *l2_table, *entry;
 	pt_entry_t mem_attr;
 
 	/* Both addresses must be page aligned. */
 	KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin"));
 
 	l1_table = (pd_entry_t *) l1pt;
 	mem_attr = l2s_mem_types[cache];
 
 	/* The L1 slot must already point at an L2 table. */
 	if ((l1_table[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
 		panic("pmap_map_entry: no L2 table for VA 0x%08x", va);
 
 	/* Turn the table's physical address back into a usable pointer. */
 	l2_table = (pt_entry_t *) kernel_pt_lookup(
 	    l1_table[L1_IDX(va)] & L1_C_ADDR_MASK);
 	if (l2_table == NULL)
 		panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va);
 
 	entry = &l2_table[l2pte_index(va)];
 	*entry = L2_S_PROTO | pa | mem_attr;
 	pmap_set_prot(entry, prot, 0);
 	PTE_SYNC(entry);
 }
 
 /*
  * pmap_map_chunk:
  *
  *	Map a chunk of memory using the most efficient mappings
  *	possible (section, large page, small page) into the
  *	provided L1 and L2 tables at the specified virtual address.
  */
 vm_size_t
 pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
     vm_size_t size, int prot, int type)
 {
 	pd_entry_t *pde = (pd_entry_t *) l1pt;
 	pt_entry_t *pte, f1, f2s, f2l;
 	vm_size_t resid;
 	int i;
 
 	/*
 	 * Returns the number of bytes mapped, i.e. "size" rounded up to a
 	 * whole number of pages.  Panics if no L1 table is supplied, or if
 	 * an L2 table is needed for some VA and the L1 slot does not hold a
 	 * coarse-table descriptor that kernel_pt_lookup() can resolve.
 	 */
 
 	/* Round the request up to a multiple of PAGE_SIZE. */
 	resid = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
 
 	if (l1pt == 0)
 		panic("pmap_map_chunk: no L1 table provided");
 
 #ifdef VERBOSE_INIT_ARM
 	printf("pmap_map_chunk: pa=0x%x va=0x%x size=0x%x resid=0x%x "
 	    "prot=0x%x type=%d\n", pa, va, size, resid, prot, type);
 #endif
 
 	/* Memory-type bits for each of the three descriptor formats. */
 	f1 = l1_mem_types[type];
 	f2l = l2l_mem_types[type];
 	f2s = l2s_mem_types[type];
 
 	/* From here on "size" holds the rounded total that is returned. */
 	size = resid;
 
 	while (resid > 0) {
 		/* See if we can use a section mapping. */
 		if (L1_S_MAPPABLE_P(va, pa, resid)) {
 #ifdef VERBOSE_INIT_ARM
 			printf("S");
 #endif
 			pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa |
 			    L1_S_PROT(PTE_KERNEL, prot) | f1 |
 			    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 			PTE_SYNC(&pde[va >> L1_S_SHIFT]);
 			va += L1_S_SIZE;
 			pa += L1_S_SIZE;
 			resid -= L1_S_SIZE;
 			continue;
 		}
 
 		/*
 		 * Ok, we're going to use an L2 table.  Make sure
 		 * one is actually in the corresponding L1 slot
 		 * for the current VA.
 		 */
 		if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
 			panic("pmap_map_chunk: no L2 table for VA 0x%08x", va);
 
 		pte = (pt_entry_t *) kernel_pt_lookup(
 		    pde[L1_IDX(va)] & L1_C_ADDR_MASK);
 		if (pte == NULL)
 			panic("pmap_map_chunk: can't find L2 table for VA "
 			    "0x%08x", va);
 		/* See if we can use a L2 large page mapping. */
 		if (L2_L_MAPPABLE_P(va, pa, resid)) {
 #ifdef VERBOSE_INIT_ARM
 			printf("L");
 #endif
 			/*
 			 * The same large-page descriptor is written into
 			 * 16 consecutive L2 slots.
 			 */
 			for (i = 0; i < 16; i++) {
 				pte[l2pte_index(va) + i] =
 				    L2_L_PROTO | pa |
 				    L2_L_PROT(PTE_KERNEL, prot) | f2l;
 				PTE_SYNC(&pte[l2pte_index(va) + i]);
 			}
 			va += L2_L_SIZE;
 			pa += L2_L_SIZE;
 			resid -= L2_L_SIZE;
 			continue;
 		}
 
 		/* Use a small page mapping. */
 #ifdef VERBOSE_INIT_ARM
 		printf("P");
 #endif
 		pte[l2pte_index(va)] = L2_S_PROTO | pa | f2s;
 		pmap_set_prot(&pte[l2pte_index(va)], prot, 0);
 		PTE_SYNC(&pte[l2pte_index(va)]);
 		va += PAGE_SIZE;
 		pa += PAGE_SIZE;
 		resid -= PAGE_SIZE;
 	}
 #ifdef VERBOSE_INIT_ARM
 	printf("\n");
 #endif
 	return (size);
 
 }
 
 /********************** Static device map routines ***************************/
 
 static const struct pmap_devmap *pmap_devmap_table;
 
 /*
  * Register the devmap table.  This is provided in case early console
  * initialization needs to register mappings created by bootstrap code
  * before pmap_devmap_bootstrap() is called.
  */
 void
 pmap_devmap_register(const struct pmap_devmap *table)
 {
 
 	/*
 	 * Only the pointer is stored; nothing is mapped here.  The
 	 * pmap_devmap_find_*() lookups consult this pointer.
 	 */
 	pmap_devmap_table = table;
 }
 
 /*
  * Map all of the static regions in the devmap table, and remember
  * the devmap table so other parts of the kernel can look up entries
  * later.
  */
 void
 pmap_devmap_bootstrap(vm_offset_t l1pt, const struct pmap_devmap *table)
 {
 	const struct pmap_devmap *pd;
 
 	/* Keep the table around for the pmap_devmap_find_*() lookups. */
 	pmap_devmap_table = table;
 
 	/* Map each entry; a zero pd_size marks the end of the table. */
 	for (pd = table; pd->pd_size != 0; pd++) {
 #ifdef VERBOSE_INIT_ARM
 		printf("devmap: %08x -> %08x @ %08x\n",
 		    pd->pd_pa, pd->pd_pa + pd->pd_size - 1, pd->pd_va);
 #endif
 		pmap_map_chunk(l1pt, pd->pd_va, pd->pd_pa, pd->pd_size,
 		    pd->pd_prot, pd->pd_cache);
 	}
 }
 
 /*
  * Find the devmap entry whose physical range fully contains
  * [pa, pa + size).  Returns a pointer to the entry, or NULL if no table
  * has been registered or no entry covers the whole range.
  */
 const struct pmap_devmap *
 pmap_devmap_find_pa(vm_paddr_t pa, vm_size_t size)
 {
 	const struct pmap_devmap *pd;
 
 	if (pmap_devmap_table == NULL)
 		return (NULL);
 
 	/* A zero pd_size terminates the table. */
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 		/* Start below the entry, or request larger than the entry. */
 		if (pa < pd->pd_pa || size > pd->pd_size)
 			continue;
 		/*
 		 * Compare offsets rather than end addresses: neither
 		 * "pa + size" nor "pd_pa + pd_size" is formed, so a range
 		 * ending at the very top of the address space cannot wrap
 		 * around to zero and defeat the check.
 		 */
 		if (pa - pd->pd_pa <= pd->pd_size - size)
 			return (pd);
 	}
 
 	return (NULL);
 }
 
 /*
  * Find the devmap entry whose virtual range fully contains
  * [va, va + size).  Returns a pointer to the entry, or NULL if no table
  * has been registered or no entry covers the whole range.
  */
 const struct pmap_devmap *
 pmap_devmap_find_va(vm_offset_t va, vm_size_t size)
 {
 	const struct pmap_devmap *pd;
 
 	if (pmap_devmap_table == NULL)
 		return (NULL);
 
 	/* A zero pd_size terminates the table. */
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 		/* Start below the entry, or request larger than the entry. */
 		if (va < pd->pd_va || size > pd->pd_size)
 			continue;
 		/*
 		 * Compare offsets rather than end addresses: neither
 		 * "va + size" nor "pd_va + pd_size" is formed, so a range
 		 * ending at the very top of the address space cannot wrap
 		 * around to zero and defeat the check.
 		 */
 		if (va - pd->pd_va <= pd->pd_size - size)
 			return (pd);
 	}
 
 	return (NULL);
 }
 
 int
 pmap_dmap_iscurrent(pmap_t pmap)
 {
 	int current;
 
 	/* Thin wrapper: hand back whatever pmap_is_current() reports. */
 	current = pmap_is_current(pmap);
 	return (current);
 }
 
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 	/*
 	 * Store the attribute in the page's machine-dependent data; it is
 	 * consulted when PTEs for the page are built later on.
 	 */
 	m->md.pv_memattr = ma;
 
 	/*
 	 * On ARM this is only expected to be called before the page has
 	 * any mapping.  A page with neither a KVA mapping nor an entry on
 	 * its pv_list is fine.  Otherwise the existing mappings would have
 	 * to be walked and rewritten (with cache, PTE and possibly TLB
 	 * maintenance), which is not implemented, so give up loudly.
 	 */
 	if (m->md.pv_kva == 0 && TAILQ_FIRST(&m->md.pv_list) == NULL)
 		return;
 
 	panic("Can't change memattr on page with existing mappings");
 }
Index: user/attilio/vmc-playground/sys/arm/arm/pmap.c
===================================================================
--- user/attilio/vmc-playground/sys/arm/arm/pmap.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/arm/arm/pmap.c	(revision 247224)
@@ -1,4959 +1,4958 @@
 /* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */
 /*-
  * Copyright 2004 Olivier Houchard.
  * Copyright 2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 2002-2003 Wasabi Systems, Inc.
  * Copyright (c) 2001 Richard Earnshaw
  * Copyright (c) 2001-2002 Christopher Gilbert
  * All rights reserved.
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*-
  * Copyright (c) 1999 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Charles M. Hannum.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Mark Brinicombe.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  *
  * RiscBSD kernel project
  *
  * pmap.c
  *
  * Machine dependent vm stuff
  *
  * Created      : 20/09/94
  */
 
 /*
  * Special compilation symbols
  * PMAP_DEBUG           - Build in pmap_debug_level code
  */
 /* Include header files */
 
 #include "opt_vm.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/uma.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_extern.h>
 
 #include <machine/md_var.h>
 #include <machine/cpu.h>
 #include <machine/cpufunc.h>
 #include <machine/pcb.h>
 
 #ifdef PMAP_DEBUG
 #define PDEBUG(_lev_,_stat_) \
         if (pmap_debug_level >= (_lev_)) \
                 ((_stat_))
 #define dprintf printf
 
 int pmap_debug_level = 0;
 #define PMAP_INLINE
 #else   /* PMAP_DEBUG */
 #define PDEBUG(_lev_,_stat_) /* Nothing */
 #define dprintf(x, arg...)
 #define PMAP_INLINE __inline
 #endif  /* PMAP_DEBUG */
 
 extern struct pv_addr systempage;
 
 extern int last_fault_code;
 
 /*
  * Internal function prototypes
  */
 static void pmap_free_pv_entry (pv_entry_t);
 static pv_entry_t pmap_get_pv_entry(void);
 
 static void		pmap_enter_locked(pmap_t, vm_offset_t, vm_page_t,
     vm_prot_t, boolean_t, int);
 static vm_paddr_t	pmap_extract_locked(pmap_t pmap, vm_offset_t va);
 static void		pmap_fix_cache(struct vm_page *, pmap_t, vm_offset_t);
 static void		pmap_alloc_l1(pmap_t);
 static void		pmap_free_l1(pmap_t);
 
 static int		pmap_clearbit(struct vm_page *, u_int);
 
 static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t);
 static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t);
 static void		pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int);
 static vm_offset_t	kernel_pt_lookup(vm_paddr_t);
 
 static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1");
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 vm_offset_t pmap_curmaxkvaddr;
 vm_paddr_t kernel_l1pa;
 
 vm_offset_t kernel_vm_end = 0;
 
 struct pmap kernel_pmap_store;
 
 static pt_entry_t *csrc_pte, *cdst_pte;
 static vm_offset_t csrcp, cdstp;
 static struct mtx cmtx;
 
 static void		pmap_init_l1(struct l1_ttable *, pd_entry_t *);
 /*
  * These routines are called when the CPU type is identified to set up
  * the PTE prototypes, cache modes, etc.
  *
  * The variables are always here, just in case LKMs need to reference
  * them (though, they shouldn't).
  */
 
 pt_entry_t	pte_l1_s_cache_mode;
 pt_entry_t	pte_l1_s_cache_mode_pt;
 pt_entry_t	pte_l1_s_cache_mask;
 
 pt_entry_t	pte_l2_l_cache_mode;
 pt_entry_t	pte_l2_l_cache_mode_pt;
 pt_entry_t	pte_l2_l_cache_mask;
 
 pt_entry_t	pte_l2_s_cache_mode;
 pt_entry_t	pte_l2_s_cache_mode_pt;
 pt_entry_t	pte_l2_s_cache_mask;
 
 pt_entry_t	pte_l2_s_prot_u;
 pt_entry_t	pte_l2_s_prot_w;
 pt_entry_t	pte_l2_s_prot_mask;
 
 pt_entry_t	pte_l1_s_proto;
 pt_entry_t	pte_l1_c_proto;
 pt_entry_t	pte_l2_s_proto;
 
 void		(*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t);
 void		(*pmap_zero_page_func)(vm_paddr_t, int, int);
 
 struct msgbuf *msgbufp = 0;
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 extern void bcopy_page(vm_offset_t, vm_offset_t);
 extern void bzero_page(vm_offset_t);
 
 extern vm_offset_t alloc_firstaddr;
 
 char *_tmppt;
 
 /*
  * Metadata for L1 translation tables.
  */
 struct l1_ttable {
 	/* Entry on the L1 Table list */
 	SLIST_ENTRY(l1_ttable) l1_link;
 
 	/* Entry on the L1 Least Recently Used list */
 	TAILQ_ENTRY(l1_ttable) l1_lru;
 
 	/* Track how many domains are allocated from this L1 */
 	volatile u_int l1_domain_use_count;
 
 	/*
 	 * A free-list of domain numbers for this L1.
 	 * We avoid using ffs() and a bitmap to track domains since ffs()
 	 * is slow on ARM.
 	 *
 	 * l1_domain_first is the index of the first free domain, and
 	 * l1_domain_free[n] holds the index of the free domain that
 	 * follows n.  pmap_init_l1() seeds the chain as n -> n + 1.
 	 */
 	u_int8_t l1_domain_first;
 	u_int8_t l1_domain_free[PMAP_DOMAINS];
 
 	/* Physical address of this L1 page table */
 	vm_paddr_t l1_physaddr;
 
 	/* KVA of this L1 page table */
 	pd_entry_t *l1_kva;
 };
 
 /*
  * Convert a virtual address into its L1 table index. That is, the
  * index used to locate the L2 descriptor table pointer in an L1 table.
  * This is basically used to index l1->l1_kva[].
  *
  * Each L2 descriptor table represents 1MB of VA space.
  */
 #define	L1_IDX(va)		(((vm_offset_t)(va)) >> L1_S_SHIFT)
 
 /*
  * L1 Page Tables are tracked using a Least Recently Used list.
  *  - New L1s are allocated from the HEAD.
  *  - Freed L1s are added to the TAIL.
  *  - Recently accessed L1s (where an 'access' is some change to one of
  *    the userland pmaps which owns this L1) are moved to the TAIL.
  */
 static TAILQ_HEAD(, l1_ttable) l1_lru_list;
 /*
  * A list of all L1 tables
  */
 static SLIST_HEAD(, l1_ttable) l1_list;
 static struct mtx l1_lru_lock;
 
 /*
  * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots.
  *
  * This is normally 16MB worth L2 page descriptors for any given pmap.
  * Reference counts are maintained for L2 descriptors so they can be
  * freed when empty.
  */
 struct l2_dtable {
 	/* The number of L2 page descriptors allocated to this l2_dtable */
 	u_int l2_occupancy;
 
 	/*
 	 * List of L2 page descriptors.  Lookups index this array with
 	 * L2_BUCKET(l1idx); a bucket whose l2b_kva is NULL is treated
 	 * as having no L2 table.
 	 */
 	struct l2_bucket {
 		pt_entry_t *l2b_kva;	/* KVA of L2 Descriptor Table */
 		vm_paddr_t l2b_phys;	/* Physical address of same */
 		u_short l2b_l1idx;	/* This L2 table's L1 index */
 		u_short l2b_occupancy;	/* How many active descriptors */
 	} l2_bucket[L2_BUCKET_SIZE];
 };
 
 /* pmap_kenter_internal flags */
 #define KENTER_CACHE	0x1
 #define KENTER_USER	0x2
 
 /*
  * Given an L1 table index, calculate the corresponding l2_dtable index
  * and bucket index within the l2_dtable.
  */
 #define	L2_IDX(l1idx)		(((l1idx) >> L2_BUCKET_LOG2) & \
 				 (L2_SIZE - 1))
 #define	L2_BUCKET(l1idx)	((l1idx) & (L2_BUCKET_SIZE - 1))
 
 /*
  * Given a virtual address, this macro returns the
  * virtual address required to drop into the next L2 bucket.
  */
 #define	L2_NEXT_BUCKET(va)	(((va) & L1_S_FRAME) + L1_S_SIZE)
 
 /*
  * We try to map the page tables write-through, if possible.  However, not
  * all CPUs have a write-through cache mode, so on those we have to sync
  * the cache when we frob page tables.
  *
  * We try to evaluate this at compile time, if possible.  However, it's
  * not always possible to do that, hence this run-time var.
  */
 int	pmap_needs_pte_sync;
 
 /*
  * Macro to determine if a mapping might be resident in the
  * instruction cache and/or TLB
  */
 #define	PV_BEEN_EXECD(f)  (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC))
 
 /*
  * Macro to determine if a mapping might be resident in the
  * data cache and/or TLB
  */
 #define	PV_BEEN_REFD(f)   (((f) & PVF_REF) != 0)
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 #define pmap_is_current(pm)	((pm) == pmap_kernel() || \
             curproc->p_vmspace->vm_map.pmap == (pm))
 static uma_zone_t pvzone = NULL;
 uma_zone_t l2zone;
 static uma_zone_t l2table_zone;
 static vm_offset_t pmap_kernel_l2dtable_kva;
 static vm_offset_t pmap_kernel_l2ptp_kva;
 static vm_paddr_t pmap_kernel_l2ptp_phys;
-static struct vm_object pvzone_obj;
 static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
 static struct rwlock pvh_global_lock;
 
 /*
  * This list exists for the benefit of pmap_map_chunk().  It keeps track
  * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can
  * find them as necessary.
  *
  * Note that the data on this list MUST remain valid after initarm() returns,
  * as pmap_bootstrap() uses it to construct L2 table metadata.
  */
 SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list);
 
 static void
 pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt)
 {
 	pd_entry_t *kernel_l1;
 	int slot;
 
 	l1->l1_kva = l1pt;
 	l1->l1_domain_use_count = 0;
 	l1->l1_domain_first = 0;
 
 	/* Chain every domain number onto the free list: slot -> slot + 1. */
 	slot = 0;
 	while (slot < PMAP_DOMAINS) {
 		l1->l1_domain_free[slot] = slot + 1;
 		slot++;
 	}
 
 	/*
 	 * Every L1 other than the kernel's own starts out as a copy of
 	 * the kernel's L1 entries.
 	 */
 	kernel_l1 = pmap_kernel()->pm_l1->l1_kva;
 	if (l1pt != kernel_l1)
 		memcpy(l1pt, kernel_l1, L1_TABLE_SIZE);
 
 	/* The table's physical address is required; zero means failure. */
 	l1->l1_physaddr = pmap_extract(pmap_kernel(), (vm_offset_t)l1pt);
 	if (l1->l1_physaddr == 0)
 		panic("pmap_init_l1: can't get PA of L1 at %p", l1pt);
 
 	/* Publish the new L1 on both the global list and the LRU list. */
 	SLIST_INSERT_HEAD(&l1_list, l1, l1_link);
 	TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 }
 
 static vm_offset_t
 kernel_pt_lookup(vm_paddr_t pa)
 {
 	struct pv_addr *ent;
 	vm_offset_t kva;
 
 	/*
 	 * Linear search of kernel_pt_list for the table whose physical
 	 * address is "pa"; 0 is returned when there is no match.
 	 */
 	kva = 0;
 	for (ent = SLIST_FIRST(&kernel_pt_list); ent != NULL;
 	    ent = SLIST_NEXT(ent, pv_list)) {
 		if (ent->pv_pa == pa) {
 			kva = ent->pv_va;
 			break;
 		}
 	}
 	return (kva);
 }
 
 #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0
 void
 pmap_pte_init_generic(void)
 {
 
 	/* Descriptor prototypes for the generic MMU. */
 	pte_l1_s_proto = L1_S_PROTO_generic;
 	pte_l1_c_proto = L1_C_PROTO_generic;
 	pte_l2_s_proto = L2_S_PROTO_generic;
 
 	/* Small-page protection bits. */
 	pte_l2_s_prot_u = L2_S_PROT_U_generic;
 	pte_l2_s_prot_w = L2_S_PROT_W_generic;
 	pte_l2_s_prot_mask = L2_S_PROT_MASK_generic;
 
 	/* Cache masks for sections, large pages and small pages. */
 	pte_l1_s_cache_mask = L1_S_CACHE_MASK_generic;
 	pte_l2_l_cache_mask = L2_L_CACHE_MASK_generic;
 	pte_l2_s_cache_mask = L2_S_CACHE_MASK_generic;
 
 	/* Ordinary mappings always get both B and C. */
 	pte_l1_s_cache_mode = L1_S_B|L1_S_C;
 	pte_l2_l_cache_mode = L2_B|L2_C;
 	pte_l2_s_cache_mode = L2_B|L2_C;
 
 	/*
 	 * Page-table mappings: when the D-cache write-back routine is a
 	 * real function (write-back cache), set only C on the assumption
 	 * that this makes those pages write-through.  When it is the
 	 * null op (write-through cache), set both B and C.
 	 */
 	if (cpufuncs.cf_dcache_wb_range != (void *) cpufunc_nullop) {
 		pte_l1_s_cache_mode_pt = L1_S_C;
 		pte_l2_l_cache_mode_pt = L2_C;
 		pte_l2_s_cache_mode_pt = L2_C;
 	} else {
 		pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C;
 		pte_l2_l_cache_mode_pt = L2_B|L2_C;
 		pte_l2_s_cache_mode_pt = L2_B|L2_C;
 	}
 
 	/* Page copy/zero implementations. */
 	pmap_copy_page_func = pmap_copy_page_generic;
 	pmap_zero_page_func = pmap_zero_page_generic;
 }
 
 #if defined(CPU_ARM8)
 void
 pmap_pte_init_arm8(void)
 {
 
 	/* Start from the generic settings. */
 	pmap_pte_init_generic();
 
 	/*
 	 * The only ARM8 difference: page tables are mapped uncached, so
 	 * all three page-table cache modes are cleared.
 	 */
 	pte_l1_s_cache_mode_pt =
 	    pte_l2_l_cache_mode_pt =
 	    pte_l2_s_cache_mode_pt = 0;
 }
 #endif /* CPU_ARM8 */
 
 #if defined(CPU_ARM9) && defined(ARM9_CACHE_WRITE_THROUGH)
 void
 pmap_pte_init_arm9(void)
 {
 
 	/* Start from the generic settings. */
 	pmap_pte_init_generic();
 
 	/*
 	 * For now ARM9 runs write-through: both ordinary mappings and
 	 * page-table mappings get the C bit only.
 	 */
 	pte_l1_s_cache_mode = pte_l1_s_cache_mode_pt = L1_S_C;
 	pte_l2_l_cache_mode = pte_l2_l_cache_mode_pt = L2_C;
 	pte_l2_s_cache_mode = pte_l2_s_cache_mode_pt = L2_C;
 }
 #endif /* CPU_ARM9 */
 #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */
 
 #if defined(CPU_ARM10)
 void
 pmap_pte_init_arm10(void)
 {
 
 	/* Start from the generic settings. */
 	pmap_pte_init_generic();
 
 	/* Page-table mappings get the C bit only. */
 	pte_l1_s_cache_mode_pt = L1_S_C;
 	pte_l2_l_cache_mode_pt = L2_C;
 	pte_l2_s_cache_mode_pt = L2_C;
 
 	/* Ordinary mappings get both B and C. */
 	pte_l1_s_cache_mode = L1_S_B | L1_S_C;
 	pte_l2_l_cache_mode = L2_B | L2_C;
 	pte_l2_s_cache_mode = L2_B | L2_C;
 }
 #endif /* CPU_ARM10 */
 
 #if  ARM_MMU_SA1 == 1
 void
 pmap_pte_init_sa1(void)
 {
 
 	/* Start from the generic settings. */
 	pmap_pte_init_generic();
 
 	/*
 	 * The StrongARM SA-1 cache has no write-through mode, so PTE
 	 * updates have to be synced explicitly ...
 	 */
 	pmap_needs_pte_sync = 1;
 
 	/* ... and page tables are mapped with B=1,C=1. */
 	pte_l2_s_cache_mode_pt = L2_B|L2_C;
 	pte_l2_l_cache_mode_pt = L2_B|L2_C;
 	pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C;
 }
 #endif /* ARM_MMU_SA1 == 1*/
 
 #if ARM_MMU_XSCALE == 1
 #if (ARM_NMMUS > 1) || defined (CPU_XSCALE_CORE3)
 static u_int xscale_use_minidata;
 #endif
 
 /*
  * Set up the PTE prototypes, cache modes, protection bits and page
  * copy/zero routines for XScale cores, then clear the XSCALE_AUXCTL_P
  * bit in the auxiliary control register.
  */
 void
 pmap_pte_init_xscale(void)
 {
 	uint32_t auxctl;
 	int write_through = 0;
 
 	/* Defaults: B|C (plus the P bit for sections) on ordinary mappings. */
 	pte_l1_s_cache_mode = L1_S_B|L1_S_C|L1_S_XSCALE_P;
 	pte_l1_s_cache_mask = L1_S_CACHE_MASK_xscale;
 
 	pte_l2_l_cache_mode = L2_B|L2_C;
 	pte_l2_l_cache_mask = L2_L_CACHE_MASK_xscale;
 
 	pte_l2_s_cache_mode = L2_B|L2_C;
 	pte_l2_s_cache_mask = L2_S_CACHE_MASK_xscale;
 
 	/* Page tables themselves are mapped with C only. */
 	pte_l1_s_cache_mode_pt = L1_S_C;
 	pte_l2_l_cache_mode_pt = L2_C;
 	pte_l2_s_cache_mode_pt = L2_C;
 #ifdef XSCALE_CACHE_READ_WRITE_ALLOCATE
 	/*
 	 * The XScale core has an enhanced mode where writes that
 	 * miss the cache cause a cache line to be allocated.  This
 	 * is significantly faster than the traditional, write-through
 	 * behavior of this case.
 	 */
 	pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_X);
 	pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_X);
 	pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_X);
 #endif /* XSCALE_CACHE_READ_WRITE_ALLOCATE */
 #ifdef XSCALE_CACHE_WRITE_THROUGH
 	/*
 	 * Some versions of the XScale core have various bugs in
 	 * their cache units, the work-around for which is to run
 	 * the cache in write-through mode.  Unfortunately, this
 	 * has a major (negative) impact on performance.  So, we
 	 * go ahead and run fast-and-loose, in the hopes that we
 	 * don't line up the planets in a way that will trip the
 	 * bugs.
 	 *
 	 * However, we give you the option to be slow-but-correct.
 	 */
 	write_through = 1;
 #elif defined(XSCALE_CACHE_WRITE_BACK)
 	/* force write back cache mode */
 	write_through = 0;
 #elif defined(CPU_XSCALE_PXA2X0)
 	/*
 	 * Intel PXA2[15]0 processors are known to have a bug in
 	 * write-back cache on revision 4 and earlier (stepping
 	 * A[01] and B[012]).  Fixed for C0 and later.
 	 */
 	{
 		uint32_t id, type;
 
 		id = cpufunc_id();
 		type = id & ~(CPU_ID_XSCALE_COREREV_MASK|CPU_ID_REVISION_MASK);
 
 		if (type == CPU_ID_PXA250 || type == CPU_ID_PXA210) {
 			if ((id & CPU_ID_REVISION_MASK) < 5) {
 				/* write through for stepping A0-1 and B0-2 */
 				write_through = 1;
 			}
 		}
 	}
 #endif /* XSCALE_CACHE_WRITE_THROUGH */
 
 	/*
 	 * Write-through replaces the ordinary cache modes wholesale with
 	 * C only; this also drops any TEX bits OR-ed in above.
 	 */
 	if (write_through) {
 		pte_l1_s_cache_mode = L1_S_C;
 		pte_l2_l_cache_mode = L2_C;
 		pte_l2_s_cache_mode = L2_C;
 	}
 
 #if (ARM_NMMUS > 1)
 	xscale_use_minidata = 1;
 #endif
 
 	pte_l2_s_prot_u = L2_S_PROT_U_xscale;
 	pte_l2_s_prot_w = L2_S_PROT_W_xscale;
 	pte_l2_s_prot_mask = L2_S_PROT_MASK_xscale;
 
 	pte_l1_s_proto = L1_S_PROTO_xscale;
 	pte_l1_c_proto = L1_C_PROTO_xscale;
 	pte_l2_s_proto = L2_S_PROTO_xscale;
 
 #ifdef CPU_XSCALE_CORE3
 	pmap_copy_page_func = pmap_copy_page_generic;
 	pmap_zero_page_func = pmap_zero_page_generic;
 	xscale_use_minidata = 0;
 	/* Make sure it is L2-cachable */
     	pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_T);
 	pte_l1_s_cache_mode_pt = pte_l1_s_cache_mode &~ L1_S_XSCALE_P;
 	pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_T) ;
 	/*
 	 * NOTE(review): the large-page page-table mode below is loaded
 	 * from the L1 *section* mode, whereas the neighbouring lines each
 	 * derive a "_pt" mode from the mode of the same descriptor type.
 	 * This looks like it may have been meant to read
 	 * pte_l2_l_cache_mode -- confirm against the descriptor bit
 	 * layouts before changing.
 	 */
 	pte_l2_l_cache_mode_pt = pte_l1_s_cache_mode;
 	pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_T);
 	pte_l2_s_cache_mode_pt = pte_l2_s_cache_mode;
 
 #else
 	pmap_copy_page_func = pmap_copy_page_xscale;
 	pmap_zero_page_func = pmap_zero_page_xscale;
 #endif
 
 	/*
 	 * Disable ECC protection of page table access, for now.
 	 */
 	/* Read-modify-write of the auxiliary control register. */
 	__asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl));
 	auxctl &= ~XSCALE_AUXCTL_P;
 	__asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl));
 }
 
 /*
  * xscale_setup_minidata:
  *
  *	Set up the mini-data cache clean area.  We require the
  *	caller to allocate the right amount of physically and
  *	virtually contiguous space.
  */
 extern vm_offset_t xscale_minidata_clean_addr;
 extern vm_size_t xscale_minidata_clean_size; /* already initialized */
 /*
  * l1pt is the L1 table to walk; va and pa are the start of the
  * mini-data clean area, whose length comes from
  * xscale_minidata_clean_size.  Panics if an L2 table covering some
  * page of the area cannot be found by kernel_pt_lookup().
  */
 void
 xscale_setup_minidata(vm_offset_t l1pt, vm_offset_t va, vm_paddr_t pa)
 {
 	pd_entry_t *pde = (pd_entry_t *) l1pt;
 	pt_entry_t *pte;
 	vm_size_t size;
 	uint32_t auxctl;
 
 	/* Record where the clean area starts. */
 	xscale_minidata_clean_addr = va;
 
 	/* Round it to page size. */
 	size = (xscale_minidata_clean_size + L2_S_OFFSET) & L2_S_FRAME;
 
 	/* Enter one read-only kernel small-page PTE per page of the area. */
 	for (; size != 0;
 	     va += L2_S_SIZE, pa += L2_S_SIZE, size -= L2_S_SIZE) {
 		pte = (pt_entry_t *) kernel_pt_lookup(
 		    pde[L1_IDX(va)] & L1_C_ADDR_MASK);
 		if (pte == NULL)
 			panic("xscale_setup_minidata: can't find L2 table for "
 			    "VA 0x%08x", (u_int32_t) va);
 		/*
 		 * NOTE(review): no PTE_SYNC() follows this store;
 		 * presumably that is acceptable because the routine is
 		 * required to run un-cached (see below) -- confirm.
 		 */
 		pte[l2pte_index(va)] =
 		    L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) |
 		    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);
 	}
 
 	/*
 	 * Configure the mini-data cache for write-back with
 	 * read/write-allocate.
 	 *
 	 * NOTE: In order to reconfigure the mini-data cache, we must
 	 * make sure it contains no valid data!  In order to do that,
 	 * we must issue a global data cache invalidate command!
 	 *
 	 * WE ASSUME WE ARE RUNNING UN-CACHED WHEN THIS ROUTINE IS CALLED!
 	 * THIS IS VERY IMPORTANT!
 	 */
 
 	/* Invalidate data and mini-data. */
 	__asm __volatile("mcr p15, 0, %0, c7, c6, 0" : : "r" (0));
 	/* Replace the MD field of the auxiliary control register. */
 	__asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl));
 	auxctl = (auxctl & ~XSCALE_AUXCTL_MD_MASK) | XSCALE_AUXCTL_MD_WB_RWA;
 	__asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl));
 }
 #endif
 
 /*
  * Allocate an L1 translation table for the specified pmap.
  * This is called at pmap creation time.
  */
 static void
 pmap_alloc_l1(pmap_t pm)
 {
 	struct l1_ttable *l1;
 	u_int8_t domain;
 	u_int in_use;
 
 	mtx_lock(&l1_lru_lock);
 
 	/* Take the L1 at the head of the LRU list off the list. */
 	l1 = TAILQ_FIRST(&l1_lru_list);
 	TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
 
 	/* Pop the first free domain number off this L1's free list. */
 	domain = l1->l1_domain_first;
 	l1->l1_domain_first = l1->l1_domain_free[domain];
 
 	/*
 	 * Account for the new user.  As long as the L1 still has free
 	 * domains it goes back on the list, at the TAIL.
 	 */
 	in_use = ++l1->l1_domain_use_count;
 	if (in_use < PMAP_DOMAINS)
 		TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 
 	mtx_unlock(&l1_lru_lock);
 
 	/* Record the assignment in the pmap; pm_domain is stored as +1. */
 	pm->pm_domain = domain + 1;
 	pm->pm_l1 = l1;
 }
 
 /*
  * Free an L1 translation table.
  * This is called at pmap destruction time.
  */
 static void
 pmap_free_l1(pmap_t pm)
 {
 	struct l1_ttable *l1;
 
 	l1 = pm->pm_l1;
 
 	mtx_lock(&l1_lru_lock);
 
 	/*
 	 * An L1 is only on the LRU list while it has a free domain;
 	 * if it is there now, pull it off so it can be repositioned.
 	 */
 	if (l1->l1_domain_use_count < PMAP_DOMAINS)
 		TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
 
 	/* Push the pmap's domain number back onto the free list. */
 	l1->l1_domain_free[pm->pm_domain - 1] = l1->l1_domain_first;
 	l1->l1_domain_first = pm->pm_domain - 1;
 	l1->l1_domain_use_count--;
 
 	/*
 	 * At least one domain is free now, so the L1 belongs on the LRU
 	 * list again: at the tail while other pmaps still use it, at
 	 * the head once it is completely unused.
 	 */
 	if (l1->l1_domain_use_count != 0) {
 		TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 	} else {
 		TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru);
 	}
 
 	mtx_unlock(&l1_lru_lock);
 }
 
 /*
  * Returns a pointer to the L2 bucket associated with the specified pmap
  * and VA, or NULL if no L2 bucket exists for the address.
  */
 static PMAP_INLINE struct l2_bucket *
 pmap_get_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *dtable;
 	struct l2_bucket *bucket;
 	u_short l1idx;
 
 	l1idx = L1_IDX(va);
 
 	/* No l2_dtable covers this L1 index: nothing is mapped here. */
 	dtable = pm->pm_l2[L2_IDX(l1idx)];
 	if (dtable == NULL)
 		return (NULL);
 
 	/* The bucket exists but has no L2 table behind it. */
 	bucket = &dtable->l2_bucket[L2_BUCKET(l1idx)];
 	if (bucket->l2b_kva == NULL)
 		return (NULL);
 
 	return (bucket);
 }
 
 /*
  * Returns a pointer to the L2 bucket associated with the specified pmap
  * and VA.
  *
  * If no L2 bucket exists, perform the necessary allocations to put an L2
  * bucket/page table in place.  Returns NULL if either allocation fails.
  *
  * The pmap lock and pvh_global_lock (write) must be held on entry.  Both
  * are dropped around each allocation and re-taken afterwards, so the
  * relevant state is re-checked once the locks are held again.
  *
  * Note that if a new L2 bucket/page was allocated, the caller *must*
  * increment the bucket occupancy counter appropriately *before*
  * releasing the pmap's lock to ensure no other thread or cpu deallocates
  * the bucket/page in the meantime.
  */
 static struct l2_bucket *
 pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	u_short l1idx;
 
 	l1idx = L1_IDX(va);
 
 	PMAP_ASSERT_LOCKED(pm);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
 		/*
 		 * No mapping at this address, as there is
 		 * no entry in the L1 table.
 		 * Need to allocate a new l2_dtable.
 		 */
 		PMAP_UNLOCK(pm);
 		rw_wunlock(&pvh_global_lock);
 		if ((l2 = uma_zalloc(l2table_zone, M_NOWAIT)) == NULL) {
 			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pm);
 			return (NULL);
 		}
 		rw_wlock(&pvh_global_lock);
 		PMAP_LOCK(pm);
 		if (pm->pm_l2[L2_IDX(l1idx)] != NULL) {
 			/*
 			 * Someone already allocated the l2_dtable while
 			 * we were doing the same.
 			 */
 			uma_zfree(l2table_zone, l2);
 			l2 = pm->pm_l2[L2_IDX(l1idx)];
 		} else {
 			bzero(l2, sizeof(*l2));
 			/*
 			 * Link it into the parent pmap
 			 */
 			pm->pm_l2[L2_IDX(l1idx)] = l2;
 		}
 	}
 
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 
 	/*
 	 * Fetch pointer to the L2 page table associated with the address.
 	 */
 	if (l2b->l2b_kva == NULL) {
 		pt_entry_t *ptep;
 
 		/*
 		 * No L2 page table has been allocated. Chances are, this
 		 * is because we just allocated the l2_dtable, above.
 		 */
 		PMAP_UNLOCK(pm);
 		rw_wunlock(&pvh_global_lock);
 		ptep = uma_zalloc(l2zone, M_NOWAIT);
 		rw_wlock(&pvh_global_lock);
 		PMAP_LOCK(pm);
 		if (l2b->l2b_kva != NULL) {
 			/*
 			 * We lost the race.  Our own allocation may have
 			 * failed, so only hand it back if we got one.
 			 */
 			if (ptep != NULL)
 				uma_zfree(l2zone, ptep);
 			return (l2b);
 		}
 		if (ptep == NULL) {
 			/*
 			 * Oops, no more L2 page tables available at this
 			 * time. We may need to deallocate the l2_dtable
 			 * if we allocated a new one above.
 			 */
 			if (l2->l2_occupancy == 0) {
 				pm->pm_l2[L2_IDX(l1idx)] = NULL;
 				uma_zfree(l2table_zone, l2);
 			}
 			return (NULL);
 		}
 
 		/*
 		 * Only translate the table's address once we know the
 		 * allocation succeeded; the bucket is left untouched on
 		 * failure.
 		 */
 		l2->l2_occupancy++;
 		l2b->l2b_kva = ptep;
 		l2b->l2b_phys = vtophys(ptep);
 		l2b->l2b_l1idx = l1idx;
 	}
 
 	return (l2b);
 }
 
 /*
  * Return an L2 page table to l2zone.
  *
  * When PMAP_INCLUDE_PTE_SYNC is defined the function takes an extra
  * 'need_sync' argument; if it is set, the table is synced before being
  * freed.
  */
 #ifdef PMAP_INCLUDE_PTE_SYNC
 static PMAP_INLINE void
 pmap_free_l2_ptp(boolean_t need_sync, pt_entry_t *l2)
 {
 	/*
 	 * Note: With a write-back cache, we may need to sync this
 	 * L2 table before re-using it.
 	 * The table may have belonged to a non-current pmap, in which
 	 * case the cache syncs would have been skipped while its pages
 	 * were being unmapped.  Were the table then immediately handed
 	 * to the *current* pmap, it could still hold stale mappings
 	 * that no cache write-back has cleared yet, and those would be
 	 * visible to the mmu.
 	 */
 	if (need_sync)
 		PTE_SYNC_RANGE(l2, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
 	uma_zfree(l2zone, l2);
 }
 #else
 static PMAP_INLINE void
 pmap_free_l2_ptp(pt_entry_t *l2)
 {
 	uma_zfree(l2zone, l2);
 }
 #endif
 /*
  * One or more mappings in the specified L2 descriptor table have just been
  * invalidated.
  *
  * Garbage collect the metadata and descriptor table itself if necessary.
  *
  * The pmap lock must be acquired when this is called (not necessary
  * for the kernel pmap).
  *
  * pm:    pmap owning the bucket.
  * l2b:   bucket whose PTEs were invalidated.
  * count: number of PTEs the caller invalidated; subtracted from the
  *        bucket's occupancy.
  */
 static void
 pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count)
 {
 	struct l2_dtable *l2;
 	pd_entry_t *pl1pd, l1pd;
 	pt_entry_t *ptep;
 	u_short l1idx;
 
 
 	/*
 	 * Update the bucket's reference count according to how many
 	 * PTEs the caller has just invalidated.
 	 */
 	l2b->l2b_occupancy -= count;
 
 	/*
 	 * Note:
 	 *
 	 * Level 2 page tables allocated to the kernel pmap are never freed
 	 * as that would require checking all Level 1 page tables and
 	 * removing any references to the Level 2 page table. See also the
 	 * comment elsewhere about never freeing bootstrap L2 descriptors.
 	 *
 	 * We make do with just invalidating the mapping in the L2 table.
 	 *
 	 * This isn't really a big deal in practice and, in fact, leads
 	 * to a performance win over time as we don't need to continually
 	 * alloc/free.
 	 */
 	if (l2b->l2b_occupancy > 0 || pm == pmap_kernel())
 		return;
 
 	/*
 	 * There are no more valid mappings in this level 2 page table.
 	 * Go ahead and NULL-out the pointer in the bucket, then
 	 * free the page table.
 	 */
 	l1idx = l2b->l2b_l1idx;
 	ptep = l2b->l2b_kva;
 	l2b->l2b_kva = NULL;
 
 	pl1pd = &pm->pm_l1->l1_kva[l1idx];
 
 	/*
 	 * If the L1 slot matches the pmap's domain
 	 * number, then invalidate it.
 	 */
 	l1pd = *pl1pd & (L1_TYPE_MASK | L1_C_DOM_MASK);
 	if (l1pd == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) {
 		*pl1pd = 0;
 		PTE_SYNC(pl1pd);
 	}
 
 	/*
 	 * Release the L2 descriptor table back to the pool cache.
 	 * With PMAP_INCLUDE_PTE_SYNC, a sync is requested when the pmap
 	 * is not the current one.
 	 */
 #ifndef PMAP_INCLUDE_PTE_SYNC
 	pmap_free_l2_ptp(ptep);
 #else
 	pmap_free_l2_ptp(!pmap_is_current(pm), ptep);
 #endif
 
 	/*
 	 * Update the reference count in the associated l2_dtable
 	 */
 	l2 = pm->pm_l2[L2_IDX(l1idx)];
 	if (--l2->l2_occupancy > 0)
 		return;
 
 	/*
 	 * There are no more valid mappings in any of the Level 1
 	 * slots managed by this l2_dtable. Go ahead and NULL-out
 	 * the pointer in the parent pmap and free the l2_dtable.
 	 */
 	pm->pm_l2[L2_IDX(l1idx)] = NULL;
 	uma_zfree(l2table_zone, l2);
 }
 
 /*
  * Pool cache constructors for L2 descriptor tables, metadata and pmap
  * structures.
  *
  * pmap_l2ptp_ctor is registered as the constructor of l2zone.  It zeroes
  * the L2 table at 'mem' and syncs it.  When PMAP_INCLUDE_PTE_SYNC is not
  * defined it first fixes up the cache mode of the kernel mapping that
  * covers the table.  The 'size', 'arg' and 'flags' arguments are not
  * used.  Always returns 0.
  */
 static int
 pmap_l2ptp_ctor(void *mem, int size, void *arg, int flags)
 {
 #ifndef PMAP_INCLUDE_PTE_SYNC
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 #ifdef ARM_USE_SMALL_ALLOC
 	pd_entry_t *pde;
 #endif
 	/* Page-aligned address of the page containing the new table. */
 	vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK;
 
 	/*
 	 * The mappings for these page tables were initially made using
 	 * pmap_kenter() by the pool subsystem. Therefore, the cache-
 	 * mode will not be right for page table mappings. To avoid
 	 * polluting the pmap_kenter() code with a special case for
 	 * page tables, we simply fix up the cache-mode here if it's not
 	 * correct.
 	 */
 #ifdef ARM_USE_SMALL_ALLOC
 	/* Skip the fix-up when the address is covered by an L1 section. */
 	pde = &kernel_pmap->pm_l1->l1_kva[L1_IDX(va)];
 	if (!l1pte_section_p(*pde)) {
 #endif
 		l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 		ptep = &l2b->l2b_kva[l2pte_index(va)];
 		pte = *ptep;
 		
 		if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
 			/*
 			 * Page tables must have the cache-mode set to
 			 * Write-Thru.
 			 */
 			*ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
 			PTE_SYNC(ptep);
 			cpu_tlb_flushD_SE(va);
 			cpu_cpwait();
 		}
 #ifdef ARM_USE_SMALL_ALLOC
 	}
 #endif
 #endif
 	/* Start from an empty table and push the zeroes out. */
 	memset(mem, 0, L2_TABLE_SIZE_REAL);
 	PTE_SYNC_RANGE(mem, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
 	return (0);
 }
 
 /*
  * A bunch of routines to conditionally flush the caches/TLB depending
  * on whether the specified pmap actually needs to be flushed at any
  * given time.
  */
 
 /*
  * Flush the I+D TLB entry for 'va', but only when 'pm' is the current pmap.
  */
 static PMAP_INLINE void
 pmap_tlb_flushID_SE(pmap_t pm, vm_offset_t va)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushID_SE(va);
 }
 
 /*
  * Flush the data TLB entry for 'va', but only when 'pm' is the current pmap.
  */
 static PMAP_INLINE void
 pmap_tlb_flushD_SE(pmap_t pm, vm_offset_t va)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushD_SE(va);
 }
 
 /*
  * Flush the whole I+D TLB, but only when 'pm' is the current pmap.
  */
 static PMAP_INLINE void
 pmap_tlb_flushID(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushID();
 }
 /*
  * Flush the whole data TLB, but only when 'pm' is the current pmap.
  */
 static PMAP_INLINE void
 pmap_tlb_flushD(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushD();
 }
 
 /*
  * Report whether 'va' has a valid L2 mapping in 'pm'.
  *
  * Returns 1 when pmap_get_pde_pte() succeeds, yields a non-NULL PTE
  * pointer and that PTE's type is not L2_TYPE_INV; returns 0 otherwise.
  */
 static int
 pmap_has_valid_mapping(pmap_t pm, vm_offset_t va)
 {
 	pd_entry_t *l1slot;
 	pt_entry_t *l2slot;
 
 	if (!pmap_get_pde_pte(pm, va, &l1slot, &l2slot))
 		return (0);
 	if (l2slot == NULL)
 		return (0);
 
 	return ((*l2slot & L2_TYPE_MASK) != L2_TYPE_INV);
 }
 
 /*
  * Write back and invalidate the I+D caches and the L2 cache for the range
  * [va, va + len), one page-bounded chunk at a time, skipping chunks that
  * have no valid mapping.  Nothing is done unless 'pm' is the current pmap
  * or the kernel pmap.
  */
 static PMAP_INLINE void
 pmap_idcache_wbinv_range(pmap_t pm, vm_offset_t va, vm_size_t len)
 {
 	vm_size_t chunk;
 
 	CTR4(KTR_PMAP, "pmap_dcache_wbinv_range: pmap %p is_kernel %d va 0x%08x"
 	    " len 0x%x ", pm, pm == pmap_kernel(), va, len);
 
 	if (!pmap_is_current(pm) && pm != pmap_kernel())
 		return;
 
 	/* The first chunk stops at the end of the page that contains va. */
 	for (chunk = MIN(PAGE_SIZE - (va & PAGE_MASK), len); len > 0;
 	    chunk = MIN(PAGE_SIZE, len)) {
 		if (pmap_has_valid_mapping(pm, va)) {
 			cpu_idcache_wbinv_range(va, chunk);
 			cpu_l2cache_wbinv_range(va, chunk);
 		}
 		va += chunk;
 		len -= chunk;
 	}
 }
 
 /*
  * Perform data-cache and L2-cache maintenance on [va, va + len) for the
  * current pmap, one page-bounded chunk at a time, skipping chunks with no
  * valid mapping.
  *
  *   do_inv && rd_only   -> invalidate only
  *   do_inv && !rd_only  -> write back and invalidate
  *   !do_inv && !rd_only -> write back only
  *   !do_inv && rd_only  -> nothing
  */
 static PMAP_INLINE void
 pmap_dcache_wb_range(pmap_t pm, vm_offset_t va, vm_size_t len, boolean_t do_inv,
     boolean_t rd_only)
 {
 	vm_size_t chunk;
 
 	CTR4(KTR_PMAP, "pmap_dcache_wb_range: pmap %p is_kernel %d va 0x%08x "
 	    "len 0x%x ", pm, pm == pmap_kernel(), va, len);
 	CTR2(KTR_PMAP, " do_inv %d rd_only %d", do_inv, rd_only);
 
 	if (!pmap_is_current(pm))
 		return;
 
 	/* The first chunk stops at the end of the page that contains va. */
 	for (chunk = MIN(PAGE_SIZE - (va & PAGE_MASK), len); len > 0;
 	    chunk = MIN(PAGE_SIZE, len)) {
 		if (pmap_has_valid_mapping(pm, va)) {
 			if (do_inv) {
 				if (rd_only) {
 					cpu_dcache_inv_range(va, chunk);
 					cpu_l2cache_inv_range(va, chunk);
 				} else {
 					cpu_dcache_wbinv_range(va, chunk);
 					cpu_l2cache_wbinv_range(va, chunk);
 				}
 			} else if (!rd_only) {
 				cpu_dcache_wb_range(va, chunk);
 				cpu_l2cache_wb_range(va, chunk);
 			}
 		}
 		va += chunk;
 		len -= chunk;
 	}
 }
 
 /*
  * Write back and invalidate the entire I+D caches and the L2 cache, but
  * only when 'pm' is the current pmap.
  */
 static PMAP_INLINE void
 pmap_idcache_wbinv_all(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 
 	cpu_idcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 }
 
 #ifdef notyet
 /*
  * Write back and invalidate the entire data cache and the L2 cache, but
  * only when 'pm' is the current pmap.
  */
 static PMAP_INLINE void
 pmap_dcache_wbinv_all(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 }
 #endif
 
 /*
  * PTE_SYNC_CURRENT:
  *
  *     Make sure the pte is written out to RAM.
  *     We need to do this for one of three cases:
  *       - We're dealing with the kernel pmap
  *       - There is no pmap active in the cache/tlb.
  *       - The specified pmap is 'active' in the cache/tlb.
  *
  *     The macro itself only tests PMAP_NEEDS_PTE_SYNC and
  *     pmap_is_current(pm); presumably pmap_is_current() covers the
  *     cases listed above -- TODO confirm against its definition.
  *     Without PMAP_INCLUDE_PTE_SYNC the macro expands to nothing.
  */
 #ifdef PMAP_INCLUDE_PTE_SYNC
 #define	PTE_SYNC_CURRENT(pm, ptep)	\
 do {					\
 	if (PMAP_NEEDS_PTE_SYNC && 	\
 	    pmap_is_current(pm))	\
 		PTE_SYNC(ptep);		\
 } while (/*CONSTCOND*/0)
 #else
 #define	PTE_SYNC_CURRENT(pm, ptep)	/* nothing */
 #endif
 
 /*
  * Rewrite the cache-mode bits of the PTE behind pv entry 'pv'.
  *
  * cacheable == -1 means we must make the entry uncacheable, 1 means
  * cacheable;
  *
  * 'pm' and 'va' identify a mapping that is exempt from the cache
  * write-back/invalidate and TLB flush on the uncacheable path: those are
  * only performed when 'pv' is a different mapping (different va or pmap)
  * and its PTE is valid.
  *
  * The bucket returned by pmap_get_l2_bucket() is dereferenced without a
  * NULL check, so the mapping described by 'pv' must exist.
  */
 static __inline void
 pmap_set_cache_entry(pv_entry_t pv, pmap_t pm, vm_offset_t va, int cacheable)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 
 	l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va);
 	ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
 
 	if (cacheable == 1) {
 		/* Switch the cache bits to the normal cacheable mode. */
 		pte = (*ptep & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode;
 		if (l2pte_valid(pte)) {
 			if (PV_BEEN_EXECD(pv->pv_flags)) {
 				pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va);
 			} else if (PV_BEEN_REFD(pv->pv_flags)) {
 				pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va);
 			}
 		}
 	} else {
 		/* Clear the cache bits, making the mapping uncacheable. */
 		pte = *ptep &~ L2_S_CACHE_MASK;
 		if ((va != pv->pv_va || pm != pv->pv_pmap) &&
 			    l2pte_valid(pte)) {
 			if (PV_BEEN_EXECD(pv->pv_flags)) {
 				pmap_idcache_wbinv_range(pv->pv_pmap,
 					    pv->pv_va, PAGE_SIZE);
 				pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va);
 			} else if (PV_BEEN_REFD(pv->pv_flags)) {
 				/* Invalidate only if the mapping is read-only. */
 				pmap_dcache_wb_range(pv->pv_pmap,
 					    pv->pv_va, PAGE_SIZE, TRUE,
 					    (pv->pv_flags & PVF_WRITE) == 0);
 				pmap_tlb_flushD_SE(pv->pv_pmap,
 					    pv->pv_va);
 			}
 		}
 	}
 	/* Store the new PTE only after the flushes above. */
 	*ptep = pte;
 	PTE_SYNC_CURRENT(pv->pv_pmap, ptep);
 }
 
 /*
  * Re-evaluate the cacheability of every mapping of page 'pg'.
  *
  * The first pass over the page's pv list counts the mappings and the
  * writable mappings, split into kernel-pmap mappings, mappings belonging
  * to 'pm', and all non-kernel mappings.  The second pass uses those
  * counts to set or clear PVF_NC / PVF_MWC on each pv entry and calls
  * pmap_set_cache_entry() whenever an entry changes cacheability.
  * 'pm' and 'va' are passed through to pmap_set_cache_entry().
  *
  * If no writable mapping remains, PVF_MOD and PGA_WRITEABLE are cleared
  * on the page.
  *
  * pvh_global_lock must be write-locked.
  */
 static void
 pmap_fix_cache(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	int pmwc = 0;
 	int writable = 0, kwritable = 0, uwritable = 0;
 	int entries = 0, kentries = 0, uentries = 0;
 	struct pv_entry *pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	/* the cache gets written back/invalidated on context switch.
 	 * therefore, if a user page shares an entry in the same page or
 	 * with the kernel map and at least one is writable, then the
 	 * cache entry must be set write-through.
 	 */
 
 	TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 			/* generate a count of the pv_entry uses */
 		if (pv->pv_flags & PVF_WRITE) {
 			if (pv->pv_pmap == pmap_kernel())
 				kwritable++;
 			else if (pv->pv_pmap == pm)
 				uwritable++;
 			writable++;
 		}
 		if (pv->pv_pmap == pmap_kernel())
 			kentries++;
 		else {
 			if (pv->pv_pmap == pm)
 				uentries++;
 			entries++;
 		}
 	}
 		/*
 		 * check if the user duplicate mapping has
 		 * been removed.
 		 */
 	if ((pm != pmap_kernel()) && (((uentries > 1) && uwritable) ||
 	    (uwritable > 1)))
 			pmwc = 1;
 
 	TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 		/* check for user uncacheable conditions - order is important */
 		if (pm != pmap_kernel() &&
 		    (pv->pv_pmap == pm || pv->pv_pmap == pmap_kernel())) {
 
 			if ((uentries > 1 && uwritable) || uwritable > 1) {
 
 				/* user duplicate mapping */
 				if (pv->pv_pmap != pmap_kernel())
 					pv->pv_flags |= PVF_MWC;
 
 				if (!(pv->pv_flags & PVF_NC)) {
 					pv->pv_flags |= PVF_NC;
 					pmap_set_cache_entry(pv, pm, va, -1);
 				}
 				continue;
 			} else	/* no longer a duplicate user */
 				pv->pv_flags &= ~PVF_MWC;
 		}
 
 		/*
 		 * check for kernel uncacheable conditions
 		 * kernel writable or kernel readable with writable user entry
 		 */
 		if ((kwritable && (entries || kentries > 1)) ||
 		    (kwritable > 1) ||
 		    ((kwritable != writable) && kentries &&
 		     (pv->pv_pmap == pmap_kernel() ||
 		      (pv->pv_flags & PVF_WRITE) ||
 		      (pv->pv_flags & PVF_MWC)))) {
 
 			if (!(pv->pv_flags & PVF_NC)) {
 				pv->pv_flags |= PVF_NC;
 				pmap_set_cache_entry(pv, pm, va, -1);
 			}
 			continue;
 		}
 
 			/* kernel and user are cacheable */
 		if ((pm == pmap_kernel()) && !(pv->pv_flags & PVF_MWC) &&
 		    (pv->pv_flags & PVF_NC)) {
 
 			pv->pv_flags &= ~PVF_NC;
 			/* Pages with an uncacheable memattr keep their PTE as is. */
 			if (pg->md.pv_memattr != VM_MEMATTR_UNCACHEABLE)
 				pmap_set_cache_entry(pv, pm, va, 1);
 			continue;
 		}
 			/* user is no longer sharable and writable */
 		if (pm != pmap_kernel() &&
 		    (pv->pv_pmap == pm || pv->pv_pmap == pmap_kernel()) &&
 		    !pmwc && (pv->pv_flags & PVF_NC)) {
 
 			pv->pv_flags &= ~(PVF_NC | PVF_MWC);
 			if (pg->md.pv_memattr != VM_MEMATTR_UNCACHEABLE)
 				pmap_set_cache_entry(pv, pm, va, 1);
 		}
 	}
 
 	/* No writable mapping is left: the page can no longer be modified. */
 	if ((kwritable == 0) && (writable == 0)) {
 		pg->md.pvh_attrs &= ~PVF_MOD;
 		vm_page_aflag_clear(pg, PGA_WRITEABLE);
 		return;
 	}
 }
 
 /*
  * Modify pte bits for all ptes corresponding to the given physical address.
  * We use `maskbits' rather than `clearbits' because we're always passing
  * constants and the latter would require an extra inversion at run-time.
  *
  * pg:       page whose mappings are to be modified.
  * maskbits: PVF_* bits to clear; PVF_WRITE implies PVF_MOD.
  *
  * Takes pvh_global_lock (write) for the whole operation and each
  * mapping's pmap lock while its PTE is examined.
  *
  * Returns the number of PTEs that were actually changed.
  */
 static int
 pmap_clearbit(struct vm_page *pg, u_int maskbits)
 {
 	struct l2_bucket *l2b;
 	struct pv_entry *pv;
 	pt_entry_t *ptep, npte, opte;
 	pmap_t pm;
 	vm_offset_t va;
 	u_int oflags;
 	int count = 0;
 
 	rw_wlock(&pvh_global_lock);
 
 	if (maskbits & PVF_WRITE)
 		maskbits |= PVF_MOD;
 	/*
 	 * Clear saved attributes (modify, reference)
 	 */
 	pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF));
 
 	if (TAILQ_EMPTY(&pg->md.pv_list)) {
 		rw_wunlock(&pvh_global_lock);
 		return (0);
 	}
 
 	/*
 	 * Loop over all current mappings setting/clearing as appropriate
 	 */
 	TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 		va = pv->pv_va;
 		pm = pv->pv_pmap;
 		oflags = pv->pv_flags;
 
 		if (!(oflags & maskbits)) {
 			/*
 			 * None of the requested bits are set on this
 			 * mapping.  If write permission is being revoked
 			 * and the mapping was marked non-cacheable, it
 			 * can be made cacheable again.
 			 */
 			if ((maskbits & PVF_WRITE) && (pv->pv_flags & PVF_NC)) {
 				if (pg->md.pv_memattr != 
 				    VM_MEMATTR_UNCACHEABLE) {
 					PMAP_LOCK(pm);
 					l2b = pmap_get_l2_bucket(pm, va);
 					ptep = &l2b->l2b_kva[l2pte_index(va)];
 					*ptep |= pte_l2_s_cache_mode;
 					PTE_SYNC(ptep);
 					PMAP_UNLOCK(pm);
 				}
 				pv->pv_flags &= ~(PVF_NC | PVF_MWC);
 			}
 			continue;
 		}
 		pv->pv_flags &= ~maskbits;
 
 		PMAP_LOCK(pm);
 
 		l2b = pmap_get_l2_bucket(pm, va);
 
 		ptep = &l2b->l2b_kva[l2pte_index(va)];
 		npte = opte = *ptep;
 
 		if (maskbits & (PVF_WRITE|PVF_MOD)) {
 			if ((pv->pv_flags & PVF_NC)) {
 				/*
 				 * Entry is not cacheable:
 				 *
 				 * Don't turn caching on again if this is a
 				 * modified emulation. This would be
 				 * inconsistent with the settings created by
 				 * pmap_fix_cache(). Otherwise, it's safe
 				 * to re-enable caching.
 				 *
 				 * There's no need to call pmap_fix_cache()
 				 * here: all pages are losing their write
 				 * permission.
 				 */
 				if (maskbits & PVF_WRITE) {
 					if (pg->md.pv_memattr !=
 					    VM_MEMATTR_UNCACHEABLE)
 						npte |= pte_l2_s_cache_mode;
 					pv->pv_flags &= ~(PVF_NC | PVF_MWC);
 				}
 			} else
 			if (opte & L2_S_PROT_W) {
 				vm_page_dirty(pg);
 				/*
 				 * Entry is writable/cacheable: check if pmap
 				 * is current if it is flush it, otherwise it
 				 * won't be in the cache
 				 */
 				if (PV_BEEN_EXECD(oflags))
 					pmap_idcache_wbinv_range(pm, pv->pv_va,
 					    PAGE_SIZE);
 				else
 				if (PV_BEEN_REFD(oflags))
 					pmap_dcache_wb_range(pm, pv->pv_va,
 					    PAGE_SIZE,
 					    (maskbits & PVF_REF) ? TRUE : FALSE,
 					    FALSE);
 			}
 
 			/* make the pte read only */
 			npte &= ~L2_S_PROT_W;
 		}
 
 		if (maskbits & PVF_REF) {
 			if ((pv->pv_flags & PVF_NC) == 0 &&
 			    (maskbits & (PVF_WRITE|PVF_MOD)) == 0) {
 				/*
 				 * Check npte here; we may have already
 				 * done the wbinv above, and the validity
 				 * of the PTE is the same for opte and
 				 * npte.
 				 */
 				if (npte & L2_S_PROT_W) {
 					if (PV_BEEN_EXECD(oflags))
 						pmap_idcache_wbinv_range(pm,
 						    pv->pv_va, PAGE_SIZE);
 					else
 					if (PV_BEEN_REFD(oflags))
 						pmap_dcache_wb_range(pm,
 						    pv->pv_va, PAGE_SIZE,
 						    TRUE, FALSE);
 				} else
 				if ((npte & L2_TYPE_MASK) != L2_TYPE_INV) {
 					/* XXXJRT need idcache_inv_range */
 					if (PV_BEEN_EXECD(oflags))
 						pmap_idcache_wbinv_range(pm,
 						    pv->pv_va, PAGE_SIZE);
 					else
 					if (PV_BEEN_REFD(oflags))
 						pmap_dcache_wb_range(pm,
 						    pv->pv_va, PAGE_SIZE,
 						    TRUE, TRUE);
 				}
 			}
 
 			/*
 			 * Make the PTE invalid so that we will take a
 			 * page fault the next time the mapping is
 			 * referenced.
 			 */
 			npte &= ~L2_TYPE_MASK;
 			npte |= L2_TYPE_INV;
 		}
 
 		/* Only write, sync and flush when the PTE really changed. */
 		if (npte != opte) {
 			count++;
 			*ptep = npte;
 			PTE_SYNC(ptep);
 			/* Flush the TLB entry if a current pmap. */
 			if (PV_BEEN_EXECD(oflags))
 				pmap_tlb_flushID_SE(pm, pv->pv_va);
 			else
 			if (PV_BEEN_REFD(oflags))
 				pmap_tlb_flushD_SE(pm, pv->pv_va);
 		}
 
 		PMAP_UNLOCK(pm);
 
 	}
 
 	if (maskbits & PVF_WRITE)
 		vm_page_aflag_clear(pg, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
 /*
  * main pv_entry manipulation functions:
  *   pmap_enter_pv: enter a mapping onto a vm_page list
  *   pmap_remove_pv: remove a mapping from a vm_page list
  *
  * NOTE: pmap_enter_pv expects to lock the pvh itself
  *       pmap_remove_pv expects the caller to lock the pvh before calling
  */
 
 /*
  * pmap_enter_pv: enter a mapping onto a vm_page's PV list
  *
  * => caller should hold the proper lock on pvh_global_lock
  * => caller should have pmap locked
  * => we will (someday) gain the lock on the vm_page's PV list
  * => caller should adjust ptp's wire_count before calling
  * => caller should not adjust pmap's wire_count
  *
  * pg:    page being mapped.
  * pve:   pre-allocated pv entry to use for the new mapping.
  * pm/va: pmap and virtual address of the new mapping.
  * flags: initial PVF_* flags for the mapping.
  *
  * If the page has a kernel address recorded in md.pv_kva, 'pve' is
  * consumed to put that kernel mapping on the lists first and a fresh pv
  * entry is obtained for the requested mapping; failure to get one panics.
  */
 static void
 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm,
     vm_offset_t va, u_int flags)
 {
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_ASSERT_LOCKED(pm);
 	if (pg->md.pv_kva != 0) {
 		pve->pv_pmap = kernel_pmap;
 		pve->pv_va = pg->md.pv_kva;
 		pve->pv_flags = PVF_WRITE | PVF_UNMAN;
 		/* pm is already locked; avoid locking kernel_pmap twice. */
 		if (pm != kernel_pmap)
 			PMAP_LOCK(kernel_pmap);
 		TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list);
 		TAILQ_INSERT_HEAD(&kernel_pmap->pm_pvlist, pve, pv_plist);
 		if (pm != kernel_pmap)
 			PMAP_UNLOCK(kernel_pmap);
 		pg->md.pv_kva = 0;
 		if ((pve = pmap_get_pv_entry()) == NULL)
 			panic("pmap_kenter_pv: no pv entries");
 	}
 	pve->pv_pmap = pm;
 	pve->pv_va = va;
 	pve->pv_flags = flags;
 	TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list);
 	TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist);
 	/* Fold the mapping's referenced/modified state into the page. */
 	pg->md.pvh_attrs |= flags & (PVF_REF | PVF_MOD);
 	if (pve->pv_flags & PVF_WIRED)
 		++pm->pm_stats.wired_count;
 	vm_page_aflag_set(pg, PGA_REFERENCED);
 }
 
 /*
  *
  * pmap_find_pv: Find a pv entry
  *
  * Walks the page's pv list and returns the entry for (pm, va), or NULL
  * if the page has no such mapping.
  *
  * => pvh_global_lock must be write-locked
  */
 static PMAP_INLINE struct pv_entry *
 pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	struct pv_entry *cur;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	TAILQ_FOREACH(cur, &pg->md.pv_list, pv_list) {
 		if (cur->pv_pmap == pm && cur->pv_va == va)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * vector_page_setprot:
  *
  *	Manipulate the protection of the vector page.
  */
 void
 vector_page_setprot(int prot)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep;
 
 	l2b = pmap_get_l2_bucket(pmap_kernel(), vector_page);
 
 	ptep = &l2b->l2b_kva[l2pte_index(vector_page)];
 
 	*ptep = (*ptep & ~L1_S_PROT_MASK) | L2_S_PROT(PTE_KERNEL, prot);
 	PTE_SYNC(ptep);
 	cpu_tlb_flushD_SE(vector_page);
 	cpu_cpwait();
 }
 
 /*
  * pmap_remove_pv: try to remove a mapping from a pv_list
  *
  * => caller should hold proper lock on pmap_main_lock
  * => pmap should be locked
  * => caller should hold lock on vm_page [so that attrs can be adjusted]
  * => caller should adjust ptp's wire_count and free PTP if needed
  * => caller should NOT adjust pmap's wire_count
  * => we return the removed pve
  */
 
 /*
  * pmap_nuke_pv: unlink pv entry 'pve' (a mapping of 'pg' in pmap 'pm')
  * from the page's and the pmap's pv lists and update the page attributes.
  *
  * => pvh_global_lock must be write-locked and 'pm' locked
  * => 'pve' itself is not freed here
  * => if the only entry left afterwards is a PVF_UNMAN one, it is removed
  *    and freed as well, and its va is recorded in pg->md.pv_kva
  */
 static void
 pmap_nuke_pv(struct vm_page *pg, pmap_t pm, struct pv_entry *pve)
 {
 
 	struct pv_entry *pv;
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_ASSERT_LOCKED(pm);
 	TAILQ_REMOVE(&pg->md.pv_list, pve, pv_list);
 	TAILQ_REMOVE(&pm->pm_pvlist, pve, pv_plist);
 	if (pve->pv_flags & PVF_WIRED)
 		--pm->pm_stats.wired_count;
 	if (pg->md.pvh_attrs & PVF_MOD)
 		vm_page_dirty(pg);
 	if (TAILQ_FIRST(&pg->md.pv_list) == NULL)
 		pg->md.pvh_attrs &= ~PVF_REF;
        	else
 		vm_page_aflag_set(pg, PGA_REFERENCED);
 	/*
 	 * Removing a non-cacheable mapping may allow the remaining
 	 * mappings to become cacheable again; otherwise, if a writable
 	 * mapping went away, check whether any writable mapping is left.
 	 */
 	if ((pve->pv_flags & PVF_NC) && ((pm == pmap_kernel()) ||
 	     (pve->pv_flags & PVF_WRITE) || !(pve->pv_flags & PVF_MWC)))
 		pmap_fix_cache(pg, pm, 0);
 	else if (pve->pv_flags & PVF_WRITE) {
 		/* 'pve' is reused as the iterator from here on. */
 		TAILQ_FOREACH(pve, &pg->md.pv_list, pv_list)
 		    if (pve->pv_flags & PVF_WRITE)
 			    break;
 		if (!pve) {
 			pg->md.pvh_attrs &= ~PVF_MOD;
 			vm_page_aflag_clear(pg, PGA_WRITEABLE);
 		}
 	}
 	pv = TAILQ_FIRST(&pg->md.pv_list);
 	if (pv != NULL && (pv->pv_flags & PVF_UNMAN) &&
 	    TAILQ_NEXT(pv, pv_list) == NULL) {
 		/*
 		 * Only a single PVF_UNMAN entry is left: remember its va
 		 * in pv_kva and drop the pv entry.
 		 */
 		pm = kernel_pmap;
 		pg->md.pv_kva = pv->pv_va;
 			/* a recursive pmap_nuke_pv */
 		TAILQ_REMOVE(&pg->md.pv_list, pv, pv_list);
 		TAILQ_REMOVE(&pm->pm_pvlist, pv, pv_plist);
 		if (pv->pv_flags & PVF_WIRED)
 			--pm->pm_stats.wired_count;
 		pg->md.pvh_attrs &= ~PVF_REF;
 		pg->md.pvh_attrs &= ~PVF_MOD;
 		vm_page_aflag_clear(pg, PGA_WRITEABLE);
 		pmap_free_pv_entry(pv);
 	}
 }
 
 /*
  * Remove the mapping (pm, va) from the pv list of 'pg'.
  *
  * Returns the pv entry that was unlinked, or NULL when the page has no
  * such entry.  In the NULL case a matching md.pv_kva is reset to 0.
  *
  * => pvh_global_lock must be write-locked
  */
 static struct pv_entry *
 pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	struct pv_entry *cur;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	TAILQ_FOREACH(cur, &pg->md.pv_list, pv_list) {
 		if (cur->pv_pmap != pm || cur->pv_va != va)
 			continue;
 		/* Found it: unlink and hand the entry back to the caller. */
 		pmap_nuke_pv(pg, pm, cur);
 		return (cur);
 	}
 
 	if (pg->md.pv_kva == va)
 		pg->md.pv_kva = 0;
 
 	return (NULL);
 }
 /*
  *
  * pmap_modify_pv: Update pv flags
  *
  * Clears 'clr_mask' and then sets 'set_mask' in the flags of the pv entry
  * for (pm, va) on page 'pg'.  The pmap's wired count is adjusted when
  * PVF_WIRED changes, and pmap_fix_cache() is run when PVF_WRITE changes.
  *
  * => pmap must be locked and pvh_global_lock write-locked
  * => caller should NOT adjust pmap's wire_count
  * => returns the old flags, or 0 if no such mapping exists
  */
 static u_int
 pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va,
     u_int clr_mask, u_int set_mask)
 {
 	struct pv_entry *target;
 	u_int before, after, changed;
 
 	PMAP_ASSERT_LOCKED(pm);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	target = pmap_find_pv(pg, pm, va);
 	if (target == NULL)
 		return (0);
 
 	/*
 	 * There is at least one VA mapping this page.
 	 */
 
 	if ((clr_mask & (PVF_REF | PVF_MOD)) != 0)
 		pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD);
 
 	before = target->pv_flags;
 	after = (before & ~clr_mask) | set_mask;
 	target->pv_flags = after;
 	changed = before ^ after;
 
 	if ((changed & PVF_WIRED) != 0) {
 		if ((after & PVF_WIRED) != 0)
 			++pm->pm_stats.wired_count;
 		else
 			--pm->pm_stats.wired_count;
 	}
 
 	if ((changed & PVF_WRITE) != 0)
 		pmap_fix_cache(pg, pm, 0);
 
 	return (before);
 }
 
 /*
  * Function to set the debug level of the pmap code.
  * Stores 'level' in pmap_debug_level and reports the new value through
  * dprintf().  Only built when PMAP_DEBUG is defined.
  */
 #ifdef PMAP_DEBUG
 void
 pmap_debug(int level)
 {
 	pmap_debug_level = level;
 	dprintf("pmap_debug: level=%d\n", pmap_debug_level);
 }
 #endif  /* PMAP_DEBUG */
 
 /*
  * Initialize 'pmap' as a byte-for-byte copy of kernel_pmap, then give it
  * its own lock: the copied mutex is zeroed and re-initialized.
  */
 void
 pmap_pinit0(struct pmap *pmap)
 {
 	PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap));
 
 	dprintf("pmap_pinit0: pmap = %08x, pm_pdir = %08x\n",
 		(u_int32_t) pmap, (u_int32_t) pmap->pm_pdir);
 	bcopy(kernel_pmap, pmap, sizeof(*pmap));
 	/* The copy must not share kernel_pmap's mutex state. */
 	bzero(&pmap->pm_mtx, sizeof(pmap->pm_mtx));
 	PMAP_LOCK_INIT(pmap);
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: the default memory
  *	attribute and an empty pv list.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	m->md.pv_memattr = VM_MEMATTR_DEFAULT;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *      Initialize the pmap module.
  *      Called by vm_init, to initialize any structures that the pmap
  *      system needs to map virtual memory.
  *
  *      Creates the UMA zones for L2 page tables, l2_dtable metadata and
  *      pv entries, and sizes the pv entry pool using the
  *      vm.pmap.shpgperproc tunable.
  */
 void
 pmap_init(void)
 {
 	int shpgperproc = PMAP_SHPGPERPROC;
 
 	PDEBUG(1, printf("pmap_init: phys_start = %08x\n", PHYSADDR));
 
 	/*
 	 * l2zone holds the L2 page tables themselves (constructed by
 	 * pmap_l2ptp_ctor); l2table_zone holds struct l2_dtable metadata.
 	 * NOTE(review): both zones are created under the same name,
 	 * "L2 Table" -- confirm whether that is intended.
 	 */
 	l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 
 	/*
 	 * Initialize the PV entry allocator.
 	 * The limit is shpgperproc entries per possible process plus one
 	 * per physical page; the high-water mark is 90% of that.
 	 */
 	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
-	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
+	uma_zone_reserve_kva(pvzone, pv_entry_max);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
 	/*
 	 * Now it is safe to enable pv_table recording.
 	 */
 	PDEBUG(1, printf("pmap_init: done!\n"));
 }
 
 /*
  * Try to resolve a fault at 'va' in pmap 'pm' inside the pmap layer, by
  * emulating the modified/referenced bits and by refreshing the L1 entry.
  *
  * pm:    pmap in which the fault occurred.
  * va:    faulting virtual address.
  * ftype: fault type; VM_PROT_WRITE selects modified-bit emulation.
  * user:  non-zero for a fault taken from user mode; PTEs without
  *        L2_S_PROT_U are then rejected.
  *
  * Takes pvh_global_lock (write) and the pmap lock for the duration.
  *
  * Returns 0 when the fault cannot be handled here (no L2 metadata, no
  * PTE, permission mismatch, vector page, or no matching pv entry).
  * Returns 1 otherwise, after flushing the TLB entry for 'va'.
  */
 int
 pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	pd_entry_t *pl1pd, l1pd;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	u_int l1idx;
 	int rv = 0;
 
 	l1idx = L1_IDX(va);
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pm);
 
 	/*
 	 * If there is no l2_dtable for this address, then the process
 	 * has no business accessing it.
 	 *
 	 * Note: This will catch userland processes trying to access
 	 * kernel addresses.
 	 */
 	l2 = pm->pm_l2[L2_IDX(l1idx)];
 	if (l2 == NULL)
 		goto out;
 
 	/*
 	 * Likewise if there is no L2 descriptor table
 	 */
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 	if (l2b->l2b_kva == NULL)
 		goto out;
 
 	/*
 	 * Check the PTE itself.
 	 */
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 	pte = *ptep;
 	if (pte == 0)
 		goto out;
 
 	/*
 	 * Catch a userland access to the vector page mapped at 0x0
 	 */
 	if (user && (pte & L2_S_PROT_U) == 0)
 		goto out;
 	/* Faults on the vector page itself are never fixed up here. */
 	if (va == vector_page)
 		goto out;
 
 	pa = l2pte_pa(pte);
 
 	if ((ftype & VM_PROT_WRITE) && (pte & L2_S_PROT_W) == 0) {
 		/*
 		 * This looks like a good candidate for "page modified"
 		 * emulation...
 		 */
 		struct pv_entry *pv;
 		struct vm_page *pg;
 
 		/* Extract the physical address of the page */
 		if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) {
 			goto out;
 		}
 		/* Get the current flags for this page. */
 
 		pv = pmap_find_pv(pg, pm, va);
 		if (pv == NULL) {
 			goto out;
 		}
 
 		/*
 		 * Do the flags say this page is writable? If not then it
 		 * is a genuine write fault. If yes then the write fault is
 		 * our fault as we did not reflect the write access in the
 		 * PTE. Now we know a write has occurred we can correct this
 		 * and also set the modified bit
 		 */
 		if ((pv->pv_flags & PVF_WRITE) == 0) {
 			goto out;
 		}
 
 		pg->md.pvh_attrs |= PVF_REF | PVF_MOD;
 		vm_page_dirty(pg);
 		pv->pv_flags |= PVF_REF | PVF_MOD;
 
 		/*
 		 * Re-enable write permissions for the page.  No need to call
 		 * pmap_fix_cache(), since this is just a
 		 * modified-emulation fault, and the PVF_WRITE bit isn't
 		 * changing. We've already set the cacheable bits based on
 		 * the assumption that we can write to this page.
 		 */
 		*ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO | L2_S_PROT_W;
 		PTE_SYNC(ptep);
 		rv = 1;
 	} else
 	if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) {
 		/*
 		 * This looks like a good candidate for "page referenced"
 		 * emulation.
 		 */
 		struct pv_entry *pv;
 		struct vm_page *pg;
 
 		/* Extract the physical address of the page */
 		if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
 			goto out;
 		/* Get the current flags for this page. */
 
 		pv = pmap_find_pv(pg, pm, va);
 		if (pv == NULL)
 			goto out;
 
 		pg->md.pvh_attrs |= PVF_REF;
 		pv->pv_flags |= PVF_REF;
 
 
 		/* Make the PTE valid again so the access can proceed. */
 		*ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO;
 		PTE_SYNC(ptep);
 		rv = 1;
 	}
 
 	/*
 	 * We know there is a valid mapping here, so simply
 	 * fix up the L1 if necessary.
 	 */
 	pl1pd = &pm->pm_l1->l1_kva[l1idx];
 	l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO;
 	if (*pl1pd != l1pd) {
 		*pl1pd = l1pd;
 		PTE_SYNC(pl1pd);
 		rv = 1;
 	}
 
 #ifdef CPU_SA110
 	/*
 	 * There are bugs in the rev K SA110.  This is a check for one
 	 * of them.
 	 */
 	if (rv == 0 && curcpu()->ci_arm_cputype == CPU_ID_SA110 &&
 	    curcpu()->ci_arm_cpurev < 3) {
 		/* Always current pmap */
 		if (l2pte_valid(pte)) {
 			extern int kernel_debug;
 			if (kernel_debug & 1) {
 				struct proc *p = curlwp->l_proc;
 				printf("prefetch_abort: page is already "
 				    "mapped - pte=%p *pte=%08x\n", ptep, pte);
 				printf("prefetch_abort: pc=%08lx proc=%p "
 				    "process=%s\n", va, p, p->p_comm);
 				printf("prefetch_abort: far=%08x fs=%x\n",
 				    cpu_faultaddress(), cpu_faultstatus());
 			}
 #ifdef DDB
 			if (kernel_debug & 2)
 				Debugger();
 #endif
 			rv = 1;
 		}
 	}
 #endif /* CPU_SA110 */
 
 #ifdef DEBUG
 	/*
 	 * If 'rv == 0' at this point, it generally indicates that there is a
 	 * stale TLB entry for the faulting address. This happens when two or
 	 * more processes are sharing an L1. Since we don't flush the TLB on
 	 * a context switch between such processes, we can take domain faults
 	 * for mappings which exist at the same VA in both processes. EVEN IF
 	 * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for
 	 * example.
 	 *
 	 * This is extremely likely to happen if pmap_enter() updated the L1
 	 * entry for a recently entered mapping. In this case, the TLB is
 	 * flushed for the new mapping, but there may still be TLB entries for
 	 * other mappings belonging to other processes in the 1MB range
 	 * covered by the L1 entry.
 	 *
 	 * Since 'rv == 0', we know that the L1 already contains the correct
 	 * value, so the fault must be due to a stale TLB entry.
 	 *
 	 * Since we always need to flush the TLB anyway in the case where we
 	 * fixed up the L1, or frobbed the L2 PTE, we effectively deal with
 	 * stale TLB entries dynamically.
 	 *
 	 * However, the above condition can ONLY happen if the current L1 is
 	 * being shared. If it happens when the L1 is unshared, it indicates
 	 * that other parts of the pmap are not doing their job WRT managing
 	 * the TLB.
 	 */
 	if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) {
 		printf("fixup: pm %p, va 0x%lx, ftype %d - nothing to do!\n",
 		    pm, (u_long)va, ftype);
 		printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n",
 		    l2, l2b, ptep, pl1pd);
 		printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n",
 		    pte, l1pd, last_fault_code);
 #ifdef DDB
 		Debugger();
 #endif
 	}
 #endif
 
 	/*
 	 * Flush the TLB entry unconditionally.  Reaching this point always
 	 * counts as handled, even if nothing above changed.
 	 */
 	cpu_tlb_flushID_SE(va);
 	cpu_cpwait();
 
 	rv = 1;
 
 out:
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pm);
 	return (rv);
 }
 
 void
 pmap_postinit(void)
 {
 	struct l2_bucket *l2b;
 	struct l1_ttable *l1;
 	pd_entry_t *pl1pt;
 	pt_entry_t *ptep, pte;
 	vm_offset_t va, eva;
 	u_int loop, needed;
 	
 	needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0);
 	needed -= 1;
 	l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK);
 
 	for (loop = 0; loop < needed; loop++, l1++) {
 		/* Allocate a L1 page table */
 		va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, M_VMPMAP, 0, 0x0,
 		    0xffffffff, L1_TABLE_SIZE, 0);
 
 		if (va == 0)
 			panic("Cannot allocate L1 KVM");
 
 		eva = va + L1_TABLE_SIZE;
 		pl1pt = (pd_entry_t *)va;
 		
 		while (va < eva) {
 				l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 				ptep = &l2b->l2b_kva[l2pte_index(va)];
 				pte = *ptep;
 				pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
 				*ptep = pte;
 				PTE_SYNC(ptep);
 				cpu_tlb_flushD_SE(va);
 				
 				va += PAGE_SIZE;
 		}
 		pmap_init_l1(l1, pl1pt);
 	}
 
 
 #ifdef DEBUG
 	printf("pmap_postinit: Allocated %d static L1 descriptor tables\n",
 	    needed);
 #endif
 }
 
 /*
  * Cache in the PCB the values describing pmap 'pm' so that they can be
  * accessed quickly from cpu_switch() et al.:
  *
  *   pcb_pagedir - physical address of the pmap's L1 table
  *   pcb_dacr    - domain access value granting client access to the
  *                 kernel domain and to the pmap's own domain
  *   pcb_pl1vec  - address of the L1 slot covering vector_page, or NULL
  *                 when vector_page is at or above KERNBASE
  *   pcb_l1vec   - L1 descriptor to store in that slot (only written
  *                 when pcb_pl1vec is non-NULL)
  */
 void
 pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb)
 {
 	struct l1_ttable *l1 = pm->pm_l1;
 	struct l2_bucket *vec_l2b;
 
 	pcb->pcb_pagedir = l1->l1_physaddr;
 	pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) |
 	    (DOMAIN_CLIENT << (pm->pm_domain * 2));
 
 	/* High vectors: no per-pmap L1 fix-up is required. */
 	if (vector_page >= KERNBASE) {
 		pcb->pcb_pl1vec = NULL;
 		return;
 	}
 
 	pcb->pcb_pl1vec = &l1->l1_kva[L1_IDX(vector_page)];
 	vec_l2b = pmap_get_l2_bucket(pm, vector_page);
 	pcb->pcb_l1vec = vec_l2b->l2b_phys | L1_C_PROTO |
 	    L1_C_DOM(pm->pm_domain) | L1_C_DOM(PMAP_DOMAIN_KERNEL);
 }
 
 /*
  * Make the address space of thread 'td' the one recorded in its PCB and,
  * when 'td' is the running thread, switch the hardware to it.
  *
  * The PCB is always refreshed via pmap_set_pcb_pagedir().  The hardware
  * is only touched when td == curthread and the values currently read
  * from CP15 differ from the ones just stored in the PCB.  The whole
  * operation runs inside a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t pm;
 	struct pcb *pcb;
 
 	pm = vmspace_pmap(td->td_proc->p_vmspace);
 	pcb = td->td_pcb;
 
 	critical_enter();
 	pmap_set_pcb_pagedir(pm, pcb);
 
 	if (td == curthread) {
 		u_int cur_dacr, cur_ttb;
 
 		/* Read the live translation table base and domain access. */
 		__asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cur_ttb));
 		__asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(cur_dacr));
 
 		/* Keep only the L1-table-aligned address bits. */
 		cur_ttb &= ~(L1_TABLE_SIZE - 1);
 
 		if (cur_ttb == (u_int)pcb->pcb_pagedir &&
 		    cur_dacr == pcb->pcb_dacr) {
 			/*
 			 * No need to switch address spaces.
 			 */
 			critical_exit();
 			return;
 		}
 
 
 		/*
 		 * We MUST, I repeat, MUST fix up the L1 entry corresponding
 		 * to 'vector_page' in the incoming L1 table before switching
 		 * to it otherwise subsequent interrupts/exceptions (including
 		 * domain faults!) will jump into hyperspace.
 		 */
 		if (pcb->pcb_pl1vec) {
 
 			*pcb->pcb_pl1vec = pcb->pcb_l1vec;
 			/*
 			 * Don't need to PTE_SYNC() at this point since
 			 * cpu_setttb() is about to flush both the cache
 			 * and the TLB.
 			 */
 		}
 
 		/* Install the new domain access value, then the new L1. */
 		cpu_domains(pcb->pcb_dacr);
 		cpu_setttb(pcb->pcb_pagedir);
 	}
 	critical_exit();
 }
 
 /*
  * Make sure the kernel mapping of 'va' (an address inside a page table)
  * uses the page-table cache mode.
  *
  * 'kl1' is the kernel L1 table used to look up the mapping.  If 'va' is
  * covered by a section mapping, the section descriptor is adjusted;
  * otherwise the L2 PTE found through kernel_pt_lookup() is adjusted.
  * Any modified descriptor is synced and written back/invalidated from
  * the data and L2 caches.
  *
  * Returns 1 if a descriptor had to be changed, 0 if it was already
  * correct.  Panics if no L2 table can be found for a non-section mapping.
  */
 static int
 pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va)
 {
 	pd_entry_t *pdep, pde;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	int rv = 0;
 
 	/*
 	 * Make sure the descriptor itself has the correct cache mode
 	 */
 	pdep = &kl1[L1_IDX(va)];
 	pde = *pdep;
 
 	if (l1pte_section_p(pde)) {
 		if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) {
 			*pdep = (pde & ~L1_S_CACHE_MASK) |
 			    pte_l1_s_cache_mode_pt;
 			PTE_SYNC(pdep);
 			cpu_dcache_wbinv_range((vm_offset_t)pdep,
 			    sizeof(*pdep));
 			cpu_l2cache_wbinv_range((vm_offset_t)pdep,
 			    sizeof(*pdep));
 			rv = 1;
 		}
 	} else {
 		pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
 		ptep = (pt_entry_t *)kernel_pt_lookup(pa);
 		if (ptep == NULL) {
 			/*
 			 * Report the address we failed to resolve; ptep is
 			 * NULL here and carries no information.
 			 */
 			panic("pmap_bootstrap: No L2 for L2 @ va 0x%x\n",
 			    (u_int)va);
 		}
 
 		ptep = &ptep[l2pte_index(va)];
 		pte = *ptep;
 		if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
 			*ptep = (pte & ~L2_S_CACHE_MASK) |
 			    pte_l2_s_cache_mode_pt;
 			PTE_SYNC(ptep);
 			cpu_dcache_wbinv_range((vm_offset_t)ptep,
 			    sizeof(*ptep));
 			cpu_l2cache_wbinv_range((vm_offset_t)ptep,
 			    sizeof(*ptep));
 			rv = 1;
 		}
 	}
 
 	return (rv);
 }
 
 /*
  * Reserve 'pages' pages of kernel virtual address space starting at
  * *availp.
  *
  * The start of the reserved range is stored in *vap and *availp is
  * advanced past it.  When 'ptep' is non-NULL, *ptep receives the address
  * of the kernel L2 PTE for the first reserved page; the function panics
  * if no L2 bucket exists for that address.
  */
 static void
 pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap,
     pt_entry_t **ptep)
 {
 	vm_offset_t base;
 
 	base = *availp;
 
 	if (ptep != NULL) {
 		struct l2_bucket *bucket;
 
 		bucket = pmap_get_l2_bucket(pmap_kernel(), base);
 		if (bucket == NULL)
 			panic("pmap_alloc_specials: no l2b for 0x%x", base);
 
 		*ptep = &bucket->l2b_kva[l2pte_index(base)];
 	}
 
 	*vap = base;
 	*availp = base + (PAGE_SIZE * pages);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On the arm this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  */
 #define PMAP_STATIC_L2_SIZE 16
 #ifdef ARM_USE_SMALL_ALLOC
 extern struct mtx smallalloc_mtx;
 #endif
 
 /*
  * pmap_bootstrap:
  *
  *	firstaddr - becomes the initial value of virtual_avail
  *	lastaddr  - becomes virtual_end and arm_nocache_startaddr
  *	l1pt      - KVA (pv_va) and physical address (pv_pa) of the L1
  *		    translation table built by initarm()
  *
  * Builds the kernel pmap's L2 metadata from the existing L1 table, fixes
  * up page-table cache modes, initialises the pmap locks and lists,
  * reserves the special KVA ranges, and records the kernel page directory
  * in thread0's PCB.
  */
 void
 pmap_bootstrap(vm_offset_t firstaddr, vm_offset_t lastaddr, struct pv_addr *l1pt)
 {
 	static struct l1_ttable static_l1;
 	static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE];
 	struct l1_ttable *l1 = &static_l1;
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	pd_entry_t pde;
 	pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va;
 	pt_entry_t *ptep;
 	vm_paddr_t pa;
 	vm_offset_t va;
 	vm_size_t size;
 	int l1idx, l2idx, l2next = 0;
 
 	PDEBUG(1, printf("firstaddr = %08x, lastaddr = %08x\n",
 	    firstaddr, lastaddr));
 	
 	virtual_avail = firstaddr;
 	kernel_pmap->pm_l1 = l1;
 	kernel_l1pa = l1pt->pv_pa;
 	
 	/*
 	 * Scan the L1 translation table created by initarm() and create
 	 * the required metadata for all valid mappings found in it.
 	 */
 	for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) {
 		pde = kernel_l1pt[l1idx];
 
 		/*
 		 * We're only interested in Coarse mappings.
 		 * pmap_extract() can deal with section mappings without
 		 * recourse to checking L2 metadata.
 		 */
 		if ((pde & L1_TYPE_MASK) != L1_TYPE_C)
 			continue;
 
 		/*
 		 * Lookup the KVA of this L2 descriptor table
 		 */
 		pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
 		ptep = (pt_entry_t *)kernel_pt_lookup(pa);
 		
 		if (ptep == NULL) {
 			panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx",
 			    (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa);
 		}
 
 		/*
 		 * Fetch the associated L2 metadata structure.
 		 * Allocate a new one if necessary.
 		 */
 		if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) {
 			/* Only PMAP_STATIC_L2_SIZE static l2_dtables exist. */
 			if (l2next == PMAP_STATIC_L2_SIZE)
 				panic("pmap_bootstrap: out of static L2s");
 			kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 =
 			    &static_l2[l2next++];
 		}
 
 		/*
 		 * One more L1 slot tracked...
 		 */
 		l2->l2_occupancy++;
 
 		/*
 		 * Fill in the details of the L2 descriptor in the
 		 * appropriate bucket.
 		 */
 		l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 		l2b->l2b_kva = ptep;
 		l2b->l2b_phys = pa;
 		l2b->l2b_l1idx = l1idx;
 
 		/*
 		 * Establish an initial occupancy count for this descriptor
 		 */
 		for (l2idx = 0;
 		    l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
 		    l2idx++) {
 			if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) {
 				l2b->l2b_occupancy++;
 			}
 		}
 
 		/*
 		 * Make sure the descriptor itself has the correct cache mode.
 		 * If not, fix it, but whine about the problem. Port-meisters
 		 * should consider this a clue to fix up their initarm()
 		 * function. :)
 		 */
 		if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) {
 			printf("pmap_bootstrap: WARNING! wrong cache mode for "
 			    "L2 pte @ %p\n", ptep);
 		}
 	}
 
 	
 	/*
 	 * Ensure the primary (kernel) L1 has the correct cache mode for
 	 * a page table. Bitch if it is not correctly set.
 	 */
 	for (va = (vm_offset_t)kernel_l1pt;
 	    va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) {
 		if (pmap_set_pt_cache_mode(kernel_l1pt, va))
 			printf("pmap_bootstrap: WARNING! wrong cache mode for "
 			    "primary L1 @ 0x%x\n", va);
 	}
 
 	/*
 	 * Descriptors may have been rewritten above: write back and
 	 * invalidate the caches and flush the whole TLB.
 	 */
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 	cpu_tlb_flushID();
 	cpu_cpwait();
 
 	/* Basic kernel pmap state. */
 	PMAP_LOCK_INIT(kernel_pmap);
 	CPU_FILL(&kernel_pmap->pm_active);
 	kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL;
 	TAILQ_INIT(&kernel_pmap->pm_pvlist);
 
  	/*
 	 * Initialize the global pv list lock.
 	 */
 	rw_init_flags(&pvh_global_lock, "pmap pv global", RW_RECURSE);
 	
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 *
 	 * NOTE(review): the SYSMAP macro defined below is not used anywhere
 	 * in this function; the reservations are done with
 	 * pmap_alloc_specials() instead.
 	 */
 #define SYSMAP(c, p, v, n)						\
     v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	/* One page each for the csrc/cdst mappings. */
 	pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte);
 	pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte);
 	pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte);
 	pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte);
 	/*
 	 * Number of L1 sections between pmap_curmaxkvaddr and lastaddr
 	 * (rounded up); reserve KVA for one L2 table per section.
 	 */
 	size = ((lastaddr - pmap_curmaxkvaddr) + L1_S_OFFSET) / L1_S_SIZE;
 	pmap_alloc_specials(&virtual_avail,
 	    round_page(size * L2_TABLE_SIZE_REAL) / PAGE_SIZE,
 	    &pmap_kernel_l2ptp_kva, NULL);
 	
 	/* And KVA for the l2_dtables covering those sections. */
 	size = (size + (L2_BUCKET_SIZE - 1)) / L2_BUCKET_SIZE;
 	pmap_alloc_specials(&virtual_avail,
 	    round_page(size * sizeof(struct l2_dtable)) / PAGE_SIZE,
 	    &pmap_kernel_l2dtable_kva, NULL);
 
 	pmap_alloc_specials(&virtual_avail,
 	    1, (vm_offset_t*)&_tmppt, NULL);
 	pmap_alloc_specials(&virtual_avail,
 	    MAXDUMPPGS, (vm_offset_t *)&crashdumpmap, NULL);
 	/* L1 bookkeeping lists and lock, then register the kernel L1. */
 	SLIST_INIT(&l1_list);
 	TAILQ_INIT(&l1_lru_list);
 	mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF);
 	pmap_init_l1(l1, kernel_l1pt);
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 
 	/* Publish the final KVA layout. */
 	virtual_avail = round_page(virtual_avail);
 	virtual_end = lastaddr;
 	kernel_vm_end = pmap_curmaxkvaddr;
 	arm_nocache_startaddr = lastaddr;
 	mtx_init(&cmtx, "TMP mappings mtx", NULL, MTX_DEF);
 
 #ifdef ARM_USE_SMALL_ALLOC
 	mtx_init(&smallalloc_mtx, "Small alloc page list", NULL, MTX_DEF);
 	arm_init_smallalloc();
 #endif
 	/* Record the kernel page directory and DACR in thread0's PCB. */
 	pmap_set_pcb_pagedir(kernel_pmap, thread0.td_pcb);
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * The caches and TLB are flushed for the pmap first.  When the vector
  * page lives below KERNBASE, its mapping is removed from the pmap and
  * the current PCB is re-pointed at thread0's (kernel) page directory
  * values.  Finally the pmap's L1 is freed and its lock destroyed.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	struct pcb *pcb;
 	
 	pmap_idcache_wbinv_all(pmap);
 	cpu_l2cache_wbinv_all();
 	pmap_tlb_flushID(pmap);
 	cpu_cpwait();
 	if (vector_page < KERNBASE) {
 		struct pcb *curpcb = PCPU_GET(curpcb);
 		/* thread0's PCB holds the kernel pmap's values. */
 		pcb = thread0.td_pcb;
 		if (pmap_is_current(pmap)) {
 			/*
  			 * Frob the L1 entry corresponding to the vector
 			 * page so that it contains the kernel pmap's domain
 			 * number. This will ensure pmap_remove() does not
 			 * pull the current vector page out from under us.
 			 */
 			critical_enter();
 			*pcb->pcb_pl1vec = pcb->pcb_l1vec;
 			cpu_domains(pcb->pcb_dacr);
 			cpu_setttb(pcb->pcb_pagedir);
 			critical_exit();
 		}
 		pmap_remove(pmap, vector_page, vector_page + PAGE_SIZE);
 		/*
 		 * Make sure cpu_switch(), et al, DTRT. This is safe to do
 		 * since this process has no remaining mappings of its own.
 		 */
 		curpcb->pcb_pl1vec = pcb->pcb_pl1vec;
 		curpcb->pcb_l1vec = pcb->pcb_l1vec;
 		curpcb->pcb_dacr = pcb->pcb_dacr;
 		curpcb->pcb_pagedir = pcb->pcb_pagedir;
 
 	}
 	pmap_free_l1(pmap);
 	PMAP_LOCK_DESTROY(pmap);
 	
 	dprintf("pmap_release()\n");
 }
 
 
 
 /*
  * Helper function for pmap_grow_l2_bucket()
  *
  * Allocate one wired, object-less page and map it read/write for the
  * kernel at 'va' with the given cache mode.  When 'pap' is non-NULL the
  * page's physical address is stored through it.
  *
  * Returns 0 on success, 1 if no page could be allocated.
  */
 static __inline int
 pmap_grow_map(vm_offset_t va, pt_entry_t cache_mode, vm_paddr_t *pap)
 {
 	struct l2_bucket *bucket;
 	struct vm_page *page;
 	pt_entry_t *slot;
 	vm_paddr_t phys;
 
 	page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 	if (page == NULL)
 		return (1);
 
 	phys = VM_PAGE_TO_PHYS(page);
 	if (pap != NULL)
 		*pap = phys;
 
 	/* Install the mapping in the kernel's L2 table for 'va'. */
 	bucket = pmap_get_l2_bucket(pmap_kernel(), va);
 	slot = &bucket->l2b_kva[l2pte_index(va)];
 	*slot = L2_S_PROTO | phys | cache_mode |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE);
 	PTE_SYNC(slot);
 
 	return (0);
 }
 
 /*
  * This is the same as pmap_alloc_l2_bucket(), except that it is only
  * used by pmap_growkernel().
  *
  * Ensures that an l2_dtable and an L2 page table exist for the L1 slot
  * covering 'va' in pmap 'pm', carving both out of the KVA ranges tracked
  * by pmap_kernel_l2dtable_kva and pmap_kernel_l2ptp_kva and mapping new
  * backing pages on demand with pmap_grow_map().  The resulting L1
  * descriptor is then written into every L1 table on l1_list.
  *
  * Returns the l2_bucket for 'va', or NULL if a backing page could not
  * be allocated.
  */
 static __inline struct l2_bucket *
 pmap_grow_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	struct l1_ttable *l1;
 	pd_entry_t *pl1pd;
 	u_short l1idx;
 	vm_offset_t nva;
 
 	l1idx = L1_IDX(va);
 
 	if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
 		/*
 		 * No mapping at this address, as there is
 		 * no entry in the L1 table.
 		 * Need to allocate a new l2_dtable.
 		 */
 		nva = pmap_kernel_l2dtable_kva;
 		if ((nva & PAGE_MASK) == 0) {
 			/*
 			 * Need to allocate a backing page
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
 				return (NULL);
 		}
 
 		l2 = (struct l2_dtable *)nva;
 		nva += sizeof(struct l2_dtable);
 
 		/*
 		 * If the page offset wrapped while advancing past the new
 		 * l2_dtable, its tail lies in the next page.
 		 */
 		if ((nva & PAGE_MASK) < (pmap_kernel_l2dtable_kva &
 		    PAGE_MASK)) {
 			/*
 			 * The new l2_dtable straddles a page boundary.
 			 * Map in another page to cover it.
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
 				return (NULL);
 		}
 
 		pmap_kernel_l2dtable_kva = nva;
 
 		/*
 		 * Link it into the parent pmap
 		 */
 		pm->pm_l2[L2_IDX(l1idx)] = l2;
 		memset(l2, 0, sizeof(*l2));
 	}
 
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 
 	/*
 	 * Fetch pointer to the L2 page table associated with the address.
 	 */
 	if (l2b->l2b_kva == NULL) {
 		pt_entry_t *ptep;
 
 		/*
 		 * No L2 page table has been allocated. Chances are, this
 		 * is because we just allocated the l2_dtable, above.
 		 */
 		nva = pmap_kernel_l2ptp_kva;
 		ptep = (pt_entry_t *)nva;
 		if ((nva & PAGE_MASK) == 0) {
 			/*
 			 * Need to allocate a backing page
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode_pt,
 			    &pmap_kernel_l2ptp_phys))
 				return (NULL);
 			PTE_SYNC_RANGE(ptep, PAGE_SIZE / sizeof(pt_entry_t));
 		}
 		/* Start with an empty L2 table and record it in the bucket. */
 		memset(ptep, 0, L2_TABLE_SIZE_REAL);
 		l2->l2_occupancy++;
 		l2b->l2b_kva = ptep;
 		l2b->l2b_l1idx = l1idx;
 		l2b->l2b_phys = pmap_kernel_l2ptp_phys;
 
 		/* Advance both cursors to the next L2 table slot. */
 		pmap_kernel_l2ptp_kva += L2_TABLE_SIZE_REAL;
 		pmap_kernel_l2ptp_phys += L2_TABLE_SIZE_REAL;
 	}
 
 	/* Distribute new L1 entry to all other L1s */
 	SLIST_FOREACH(l1, &l1_list, l1_link) {
 			pl1pd = &l1->l1_kva[L1_IDX(va)];
 			*pl1pd = l2b->l2b_phys | L1_C_DOM(PMAP_DOMAIN_KERNEL) |
 			    L1_C_PROTO;
 			PTE_SYNC(pl1pd);
 	}
 
 	return (l2b);
 }
 
 
 /*
  * Grow the kernel page tables, if needed, so that they cover kernel
  * virtual addresses up to 'addr'.
  *
  * New L2 tables are added one L1 section (L1_S_SIZE) at a time starting
  * at pmap_curmaxkvaddr.  Afterwards the caches are written back and
  * invalidated, the data TLB is flushed, and kernel_vm_end is updated.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	pmap_t kpm;
 
 	kpm = pmap_kernel();
 
 	/* Already covered: nothing to do. */
 	if (pmap_curmaxkvaddr >= addr)
 		return;
 
 	/* Add kernel page table pages, mapping 1MB at a time. */
 	while (pmap_curmaxkvaddr < addr) {
 		pmap_grow_l2_bucket(kpm, pmap_curmaxkvaddr);
 		pmap_curmaxkvaddr += L1_S_SIZE;
 	}
 
 	/*
 	 * Flushing everything is expensive, but growing the kernel map
 	 * happens rarely.
 	 */
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 	cpu_tlb_flushD();
 	cpu_cpwait();
 
 	kernel_vm_end = pmap_curmaxkvaddr;
 }
 
 
 /*
  * Remove every removable mapping from the given address space in one
  * pass over the pmap's pv list; this speeds up process exit compared
  * with calling pmap_remove() on the whole range.
  *
  * Wired and unmanaged (PVF_UNMAN) entries are left in place.  The caches
  * are written back and invalidated before the walk and the TLB is
  * flushed after it.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	struct l2_bucket *bucket = NULL;
 	struct pv_entry *cur, *next;
 	pt_entry_t *ptep;
 	vm_page_t pg;
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	cpu_idcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 
 	cur = TAILQ_FIRST(&pmap->pm_pvlist);
 	while (cur != NULL) {
 		/* Fetch the successor now; 'cur' may be freed below. */
 		next = TAILQ_NEXT(cur, pv_plist);
 
 		if ((cur->pv_flags & (PVF_WIRED | PVF_UNMAN)) != 0) {
 			/* Cannot remove wired or unmanaged pages now. */
 			cur = next;
 			continue;
 		}
 
 		pmap->pm_stats.resident_count--;
 		bucket = pmap_get_l2_bucket(pmap, cur->pv_va);
 		KASSERT(bucket != NULL, ("No L2 bucket in pmap_remove_pages"));
 		ptep = &bucket->l2b_kva[l2pte_index(cur->pv_va)];
 		pg = PHYS_TO_VM_PAGE(*ptep & L2_ADDR_MASK);
 #ifdef ARM_USE_SMALL_ALLOC
 		KASSERT((vm_offset_t)pg >= alloc_firstaddr, ("Trying to access non-existent page va %x pte %x", cur->pv_va, *ptep));
 #else
 		KASSERT((vm_offset_t)pg >= KERNBASE, ("Trying to access non-existent page va %x pte %x", cur->pv_va, *ptep));
 #endif
 		/* Tear down the PTE, then the pv entry and L2 reference. */
 		*ptep = 0;
 		PTE_SYNC(ptep);
 		pmap_nuke_pv(pg, pmap, cur);
 		if (TAILQ_EMPTY(&pg->md.pv_list))
 			vm_page_aflag_clear(pg, PGA_WRITEABLE);
 		pmap_free_pv_entry(cur);
 		pmap_free_l2_bucket(pmap, bucket, 1);
 
 		cur = next;
 	}
 
 	rw_wunlock(&pvh_global_lock);
 	cpu_tlb_flushID();
 	cpu_cpwait();
 	PMAP_UNLOCK(pmap);
 }
 
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 #ifdef ARM_HAVE_SUPERSECTIONS
 /*
  * Map a super section into the KVA.
  *
  * 'va' and 'pa' must both be aligned to the super section size.  'flags'
  * selects the cache mode: SECTION_CACHE for the normal mode, SECTION_PT
  * for the page-table mode, neither for no cache-mode bits.  The same
  * descriptor is written to every L1 slot that the super section spans,
  * in every L1 table on l1_list.
  */
 
 void
 pmap_kenter_supersection(vm_offset_t va, uint64_t pa, int flags)
 {
 	struct l1_ttable *l1;
 	pd_entry_t *slot, desc;
 	vm_offset_t cur, first, limit;
 
 	/* Bits 35:32 of the physical address go into descriptor bits 23:20. */
 	desc = L1_S_PROTO | L1_S_SUPERSEC | (pa & L1_SUP_FRAME) |
 	    (((pa >> 32) & 0xf) << 20) |
 	    L1_S_PROT(PTE_KERNEL, VM_PROT_READ|VM_PROT_WRITE) |
 	    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 
 	KASSERT(((va | pa) & L1_SUP_OFFSET) == 0,
 	    ("Not a valid super section mapping"));
 
 	if (flags & SECTION_CACHE) {
 		desc |= pte_l1_s_cache_mode;
 	} else if (flags & SECTION_PT) {
 		desc |= pte_l1_s_cache_mode_pt;
 	}
 
 	first = va & L1_SUP_FRAME;
 	limit = va + L1_SUP_SIZE;
 
 	SLIST_FOREACH(l1, &l1_list, l1_link) {
 		for (cur = first; cur < limit; cur += L1_S_SIZE) {
 			slot = &l1->l1_kva[L1_IDX(cur)];
 			*slot = desc;
 			PTE_SYNC(slot);
 		}
 	}
 }
 #endif
 
 /*
  * Map a section into the KVA.
  *
  * 'va' and 'pa' must both be section aligned.  'flags' selects the cache
  * mode: SECTION_CACHE for the normal mode, SECTION_PT for the page-table
  * mode, neither for no cache-mode bits.  The descriptor is written into
  * every L1 table on l1_list.
  */
 
 void
 pmap_kenter_section(vm_offset_t va, vm_offset_t pa, int flags)
 {
 	struct l1_ttable *l1;
 	pd_entry_t *slot, desc;
 
 	desc = L1_S_PROTO | pa |
 	    L1_S_PROT(PTE_KERNEL, VM_PROT_READ|VM_PROT_WRITE) |
 	    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 
 	KASSERT(((va | pa) & L1_S_OFFSET) == 0,
 	    ("Not a valid section mapping"));
 
 	if (flags & SECTION_CACHE) {
 		desc |= pte_l1_s_cache_mode;
 	} else if (flags & SECTION_PT) {
 		desc |= pte_l1_s_cache_mode_pt;
 	}
 
 	SLIST_FOREACH(l1, &l1_list, l1_link) {
 		slot = &l1->l1_kva[L1_IDX(va)];
 		*slot = desc;
 		PTE_SYNC(slot);
 	}
 }
 
 /*
  * Make a temporary mapping for a physical address.  This is only intended
  * to be used for panic dumps.
  *
  * 'pa' is mapped at page slot 'i' of the crashdumpmap window.  The value
  * returned is always the base of that window, not the address of slot
  * 'i'.
  */
 void *
 pmap_kenter_temp(vm_paddr_t pa, int i)
 {
 
 	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
 
 	return ((void *)crashdumpmap);
 }
 
 /*
  * add a wired page to the kva
  * note that in order for the mapping to take effect -- you
  * should do a invltlb after doing the pmap_kenter...
  *
  *	va    - kernel virtual address to map
  *	pa    - physical address to map there
  *	flags - KENTER_CACHE selects the normal cache mode for the mapping;
  *		KENTER_USER additionally sets L2_S_PROT_U on the PTE
  *
  * Any valid mapping already present at 'va' is removed first with
  * pmap_kremove().  The L2 bucket is grown on demand.
  */
 static PMAP_INLINE void
 pmap_kenter_internal(vm_offset_t va, vm_offset_t pa, int flags)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *pte;
 	pt_entry_t opte;
 	struct pv_entry *pve;
 	vm_page_t m;
 
 	PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n",
 	    (uint32_t) va, (uint32_t) pa));
 
 
 	l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 	if (l2b == NULL)
 		l2b = pmap_grow_l2_bucket(pmap_kernel(), va);
 	KASSERT(l2b != NULL, ("No L2 Bucket"));
 	pte = &l2b->l2b_kva[l2pte_index(va)];
 	opte = *pte;
 	/*
 	 * NOTE(review): the new PTE has not been written yet, so the value
 	 * printed as "npte" here is the same as opte.
 	 */
 	PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n",
 	    (uint32_t) pte, opte, *pte));
 	if (l2pte_valid(opte)) {
 		pmap_kremove(va);
 	} else {
 		/* A previously empty slot becomes occupied. */
 		if (opte == 0)
 			l2b->l2b_occupancy++;
 	}
 	*pte = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	if (flags & KENTER_CACHE)
 		*pte |= pte_l2_s_cache_mode;
 	if (flags & KENTER_USER)
 		*pte |= L2_S_PROT_U;
 	PTE_SYNC(pte);
 
 	/*
 	 * A kernel mapping may not be the page's only mapping, so create a PV
 	 * entry to ensure proper caching.
  	 *
 	 * The existence test for the pvzone is used to delay the recording of
 	 * kernel mappings until the VM system is fully initialized.
 	 *
 	 * This expects the physical memory to have a vm_page_array entry.
 	 */
 	if (pvzone != NULL && (m = vm_phys_paddr_to_vm_page(pa)) != NULL) {
 		rw_wlock(&pvh_global_lock);
 		/*
 		 * If the page already has other mappings (pv entries or a
 		 * recorded kernel VA), track this one with a real pv entry;
 		 * otherwise just remember the kernel VA in the page.
 		 */
 		if (!TAILQ_EMPTY(&m->md.pv_list) || m->md.pv_kva != 0) {
 			if ((pve = pmap_get_pv_entry()) == NULL)
 				panic("pmap_kenter_internal: no pv entries");	
 			PMAP_LOCK(pmap_kernel());
 			pmap_enter_pv(m, pve, pmap_kernel(), va,
 			    PVF_WRITE | PVF_UNMAN);
 			pmap_fix_cache(m, pmap_kernel(), va);
 			PMAP_UNLOCK(pmap_kernel());
 		} else {
 			m->md.pv_kva = va;
 		}
 		rw_wunlock(&pvh_global_lock);
 	}
 }
 
 /*
  * Map physical address 'pa' at kernel virtual address 'va' using the
  * normal cache mode (KENTER_CACHE).
  */
 void
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 	pmap_kenter_internal(va, pa, KENTER_CACHE);
 }
 
 /*
  * Map physical address 'pa' at kernel virtual address 'va' without
  * requesting any cache mode (no KENTER_* flags are passed).
  */
 void
 pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pmap_kenter_internal(va, pa, 0);
 }
 
 /*
  * Map physical address 'pa' at kernel virtual address 'va' with the
  * normal cache mode and with user access enabled (KENTER_USER), then
  * pre-fault the mapping for read and write access.
  */
 void
 pmap_kenter_user(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pmap_kenter_internal(va, pa, KENTER_CACHE|KENTER_USER);
 	/*
 	 * Call pmap_fault_fixup now, to make sure we'll have no exception
 	 * at the first use of the new address, or bad things will happen,
 	 * as we use one of these addresses in the exception handlers.
 	 */
 	pmap_fault_fixup(pmap_kernel(), va, VM_PROT_READ|VM_PROT_WRITE, 1);
 }
 
 /*
  * Return the physical address that kernel virtual address 'va' maps to,
  * as reported by pmap_extract_locked() on the kernel pmap.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 
 	return (pmap_extract_locked(kernel_pmap, va));
 }
 
 /*
  * remove a page from the kernel pagetables
  *
  * Does nothing if there is no L2 bucket for 'va' or if the PTE there is
  * not valid.  Otherwise any pv entry recorded for the mapping is freed,
  * the page is written back and invalidated from the data and L2 caches,
  * the TLB entry is flushed, and the PTE is cleared and synced so that
  * the cleared entry is visible to the table walker, as is done for every
  * other PTE store in this file.
  */
 void
 pmap_kremove(vm_offset_t va)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *pte, opte;
 	struct pv_entry *pve;
 	vm_page_t m;
 	vm_paddr_t pa;
 
 	l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 	if (l2b == NULL)
 		return;
 
 	pte = &l2b->l2b_kva[l2pte_index(va)];
 	opte = *pte;
 	if (!l2pte_valid(opte))
 		return;
 
 	/* pa = vtophs(va) taken from pmap_extract() */
 	switch (opte & L2_TYPE_MASK) {
 	case L2_TYPE_L:
 		pa = (opte & L2_L_FRAME) | (va & L2_L_OFFSET);
 		break;
 	default:
 		pa = (opte & L2_S_FRAME) | (va & L2_S_OFFSET);
 		break;
 	}
 
 	/*
 	 * note: should never have to remove an allocation
 	 * before the pvzone is initialized.
 	 */
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap_kernel());
 	if (pvzone != NULL && (m = vm_phys_paddr_to_vm_page(pa)) &&
 	    (pve = pmap_remove_pv(m, pmap_kernel(), va)))
 		pmap_free_pv_entry(pve);
 	PMAP_UNLOCK(pmap_kernel());
 	rw_wunlock(&pvh_global_lock);
 
 	/* Cache maintenance by VA must happen while the mapping exists. */
 	va = va & ~PAGE_MASK;
 	cpu_dcache_wbinv_range(va, PAGE_SIZE);
 	cpu_l2cache_wbinv_range(va, PAGE_SIZE);
 	cpu_tlb_flushD_SE(va);
 	cpu_cpwait();
 
 	*pte = 0;
 	PTE_SYNC(pte);
 }
 
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping. Architectures which can support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged. Other
  *	architectures should map the pages starting at '*virt' and
  *	update '*virt' with the first usable address after the mapped
  *	region.
  *
  *	With ARM_USE_SMALL_ALLOC the result of arm_ptovirt(start) is
  *	returned and '*virt' is left alone.  Otherwise each page in
  *	[start, end) is entered with pmap_kenter() and the original
  *	'*virt' is returned.  'prot' is only used for debug output.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	return (arm_ptovirt(start));
 #else
 	vm_offset_t base, cur, pa;
 
 	base = *virt;
 
 	PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, "
 	    "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end,
 	    prot));
 
 	cur = base;
 	for (pa = start; pa < end; pa += PAGE_SIZE) {
 		pmap_kenter(cur, pa);
 		cur += PAGE_SIZE;
 	}
 
 	*virt = cur;
 	return (base);
 #endif
 }
 
 /*
  * Call pmap_dcache_wb_range() on every mapping recorded in the page's
  * pv list, covering one page per mapping.  The fourth argument is always
  * FALSE; the last one is true when the mapping lacks PVF_WRITE.
  */
 static void
 pmap_wb_page(vm_page_t m)
 {
 	struct pv_entry *pve;
 	boolean_t not_writable;
 
 	TAILQ_FOREACH(pve, &m->md.pv_list, pv_list) {
 		not_writable = (pve->pv_flags & PVF_WRITE) == 0;
 		pmap_dcache_wb_range(pve->pv_pmap, pve->pv_va, PAGE_SIZE,
 		    FALSE, not_writable);
 	}
 }
 
 /*
  * Call pmap_dcache_wb_range() on every mapping recorded in the page's
  * pv list, covering one page per mapping, with both trailing boolean
  * arguments set to TRUE.
  */
 static void
 pmap_inv_page(vm_page_t m)
 {
 	struct pv_entry *pve;
 
 	TAILQ_FOREACH(pve, &m->md.pv_list, pv_list) {
 		pmap_dcache_wb_range(pve->pv_pmap, pve->pv_va, PAGE_SIZE,
 		    TRUE, TRUE);
 	}
 }
 /*
  * Add a list of wired pages to the kva.
  *
  * This routine is only used for temporary kernel mappings that do not
  * need to have page modification or references recorded.  Note that old
  * mappings are simply written over.  The page *must* be wired.
  *
  * The 'count' pages in 'm' are mapped at consecutive page addresses
  * starting at 'va'; pmap_wb_page() is run on each page before it is
  * entered.
  */
 void
 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
 {
 	vm_page_t pg;
 	int idx;
 
 	idx = 0;
 	while (idx < count) {
 		pg = m[idx++];
 		pmap_wb_page(pg);
 		pmap_kenter_internal(va, VM_PAGE_TO_PHYS(pg), KENTER_CACHE);
 		va += PAGE_SIZE;
 	}
 }
 
 
 /*
  * Remove 'count' consecutive page mappings starting at 'va' from the
  * kernel; meant only for temporary mappings.
  *
  * Pages whose address translates to physical address 0 are skipped.
  * For the others, pmap_inv_page() is run on the backing page before the
  * mapping is removed with pmap_kremove().
  */
 void
 pmap_qremove(vm_offset_t va, int count)
 {
 	vm_paddr_t phys;
 
 	for (; count > 0; count--, va += PAGE_SIZE) {
 		phys = vtophys(va);
 		if (phys == 0)
 			continue;
 
 		pmap_inv_page(PHYS_TO_VM_PAGE(phys));
 		pmap_kremove(va);
 	}
 }
 
 
 /*
  * pmap_object_init_pt preloads the ptes for a given object
  * into the specified pmap.  This eliminates the blast of soft
  * faults on process startup and immediately after an mmap.
  *
  * In this implementation no mappings are actually preloaded: the body
  * only asserts that the object lock is held and that the object is of
  * type OBJT_DEVICE or OBJT_SG.  'pmap', 'addr', 'pindex' and 'size' are
  * not used.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.
  *
  *	An address is eligible when pmap_get_pde_pte() finds page-table
  *	storage for it and the PTE found there is zero.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pdep;
 	pt_entry_t *ptep;
 	boolean_t eligible;
 
 	eligible = FALSE;
 	if (pmap_get_pde_pte(pmap, addr, &pdep, &ptep)) {
 		KASSERT(ptep != NULL, ("Valid mapping but no pte ?"));
 		if (*ptep == 0)
 			eligible = TRUE;
 	}
 
 	return (eligible);
 }
 
 /*
  * Fetch pointers to the PDE/PTE for the given pmap/VA pair.
  * Returns TRUE if the mapping exists, else FALSE.
  *
  * NOTE: This function is only used by a couple of arm-specific modules.
  * It is not safe to take any pmap locks here, since we could be right
  * in the middle of debugging the pmap anyway...
  *
  * It is possible for this routine to return FALSE even though a valid
  * mapping does exist. This is because we don't lock, so the metadata
  * state may be inconsistent.
  *
  * NOTE: We can return a NULL *ptp in the case where the L1 pde is
  * a "section" mapping.
  */
 boolean_t
 pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp)
 {
 	struct l2_dtable *dtable;
 	pd_entry_t *l1slot;
 	pt_entry_t *l2table;
 	u_short idx;
 
 	/* A pmap without an L1 table has no mappings to report. */
 	if (pm->pm_l1 == NULL)
 		return (FALSE);
 
 	idx = L1_IDX(va);
 	l1slot = &pm->pm_l1->l1_kva[idx];
 	*pdp = l1slot;
 
 	/* Section mappings have no L2 PTE; report that with a NULL *ptp. */
 	if (l1pte_section_p(*l1slot)) {
 		*ptp = NULL;
 		return (TRUE);
 	}
 
 	if (pm->pm_l2 == NULL)
 		return (FALSE);
 
 	/* Walk the L2 metadata to locate the L2 table, if one exists. */
 	dtable = pm->pm_l2[L2_IDX(idx)];
 	if (dtable == NULL)
 		return (FALSE);
 
 	l2table = dtable->l2_bucket[L2_BUCKET(idx)].l2b_kva;
 	if (l2table == NULL)
 		return (FALSE);
 
 	*ptp = &l2table[l2pte_index(va)];
 	return (TRUE);
 }
 
 /*
  *      Routine:        pmap_remove_all
  *      Function:
  *              Removes this physical page from
  *              all physical maps in which it resides.
  *              Reflects back modify bits to the pager.
  *
  *      Notes:
  *              Original versions of this routine were very
  *              inefficient because they iteratively called
  *              pmap_remove (slow...)
  *
  *              The page must be managed.  Entries flagged PVF_UNMAN
  *              lose their pv entry but keep their mapping; the VA is
  *              remembered in m->md.pv_kva instead.  The TLB is only
  *              flushed if one of the removed entries belonged to the
  *              current process's pmap or to the kernel pmap.
  */
 void
 pmap_remove_all(vm_page_t m)
 {
 	pv_entry_t pv;
 	pt_entry_t *ptep;
 	struct l2_bucket *l2b;
 	boolean_t flush = FALSE;
 	pmap_t curpm;
 	int flags = 0;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	/* Unlocked fast path: nothing is mapped. */
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		return;
 	rw_wlock(&pvh_global_lock);
 	pmap_remove_write(m);
 	curpm = vmspace_pmap(curproc->p_vmspace);
 	/* Each iteration unlinks the head entry, so re-read the head. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		if (flush == FALSE && (pv->pv_pmap == curpm ||
 		    pv->pv_pmap == pmap_kernel()))
 			flush = TRUE;
 
 		PMAP_LOCK(pv->pv_pmap);
 		/*
 		 * Cached contents were written-back in pmap_remove_write(),
 		 * but we still have to invalidate the cache entry to make
 		 * sure stale data are not retrieved when another page will be
 		 * mapped under this virtual address.
 		 */
 		if (pmap_is_current(pv->pv_pmap)) {
 			cpu_dcache_inv_range(pv->pv_va, PAGE_SIZE);
 			if (pmap_has_valid_mapping(pv->pv_pmap, pv->pv_va))
 				cpu_l2cache_inv_range(pv->pv_va, PAGE_SIZE);
 		}
 
 		if (pv->pv_flags & PVF_UNMAN) {
 			/* remove the pv entry, but do not remove the mapping
 			 * and remember this is a kernel mapped page
 			 */
 			m->md.pv_kva = pv->pv_va;
 		} else {
 			/* remove the mapping and pv entry */
 			l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va);
 			KASSERT(l2b != NULL, ("No l2 bucket"));
 			ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
 			*ptep = 0;
 			PTE_SYNC_CURRENT(pv->pv_pmap, ptep);
 			pmap_free_l2_bucket(pv->pv_pmap, l2b, 1);
 			pv->pv_pmap->pm_stats.resident_count--;
 			/* Accumulate flags to pick the TLB flush type below. */
 			flags |= pv->pv_flags;
 		}
 		pmap_nuke_pv(m, pv->pv_pmap, pv);
 		PMAP_UNLOCK(pv->pv_pmap);
 		pmap_free_pv_entry(pv);
 	}
 
 	if (flush) {
 		/* Flush I+D if any removed mapping was executable. */
 		if (PV_BEEN_EXECD(flags))
 			pmap_tlb_flushID(curpm);
 		else
 			pmap_tlb_flushD(curpm);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 }
 
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	pm       - pmap to operate on
  *	sva, eva - start (inclusive) and end (exclusive) of the range
  *	prot     - requested protection
  *
  *	Removing read permission removes the mappings entirely.  A request
  *	that includes write permission is a no-op.  Otherwise every
  *	writable PTE in the range is write-protected.
  */
 void
 pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_offset_t next_bucket;
 	u_int flags;
 	int flush;
 
 	CTR4(KTR_PMAP, "pmap_protect: pmap %p sva 0x%08x eva 0x%08x prot %x",
 	    pm, sva, eva, prot);
 
 	if ((prot & VM_PROT_READ) == 0) {
 		pmap_remove(pm, sva, eva);
 		return;
 	}
 
 	if (prot & VM_PROT_WRITE) {
 		/*
 		 * If this is a read->write transition, just ignore it and let
 		 * vm_fault() take care of it later.
 		 */
 		return;
 	}
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pm);
 
 	/*
 	 * OK, at this point, we know we're doing write-protect operation.
 	 * If the pmap is active, write-back the range.
 	 */
 	pmap_dcache_wb_range(pm, sva, eva - sva, FALSE, FALSE);
 
 	/*
 	 * flush == -1: small range (under four pages); flush the TLB entry
 	 * of each changed page as we go.
 	 * flush >= 0: larger range; count changed pages and collect their
 	 * flags, then do one full TLB flush at the end.
 	 */
 	flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1;
 	flags = 0;
 
 	while (sva < eva) {
 		/* Process at most one L2 bucket's worth of VA per pass. */
 		next_bucket = L2_NEXT_BUCKET(sva);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		l2b = pmap_get_l2_bucket(pm, sva);
 		if (l2b == NULL) {
 			/* Nothing mapped in this bucket; skip it. */
 			sva = next_bucket;
 			continue;
 		}
 
 		ptep = &l2b->l2b_kva[l2pte_index(sva)];
 
 		while (sva < next_bucket) {
 			/* Only entries that are present and writable change. */
 			if ((pte = *ptep) != 0 && (pte & L2_S_PROT_W) != 0) {
 				struct vm_page *pg;
 				u_int f;
 
 				pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
 				pte &= ~L2_S_PROT_W;
 				*ptep = pte;
 				PTE_SYNC(ptep);
 
 				if (!(pg->oflags & VPO_UNMANAGED)) {
 					/*
 					 * Clear PVF_WRITE in the pv entry; if
 					 * the returned flags had it set, mark
 					 * the page dirty.
 					 */
 					f = pmap_modify_pv(pg, pm, sva,
 					    PVF_WRITE, 0);
 					if (f & PVF_WRITE)
 						vm_page_dirty(pg);
 				} else
 					f = 0;
 
 				if (flush >= 0) {
 					flush++;
 					flags |= f;
 				} else
 				if (PV_BEEN_EXECD(f))
 					pmap_tlb_flushID_SE(pm, sva);
 				else
 				if (PV_BEEN_REFD(f))
 					pmap_tlb_flushD_SE(pm, sva);
 			}
 
 			sva += PAGE_SIZE;
 			ptep++;
 		}
 	}
 
 
 	/*
 	 * Deferred full flush.  In per-page mode 'flush' is still -1 here,
 	 * but 'flags' is 0 so neither branch fires.
 	 */
 	if (flush) {
 		if (PV_BEEN_EXECD(flags))
 			pmap_tlb_flushID(pm);
 		else
 		if (PV_BEEN_REFD(flags))
 			pmap_tlb_flushD(pm);
 	}
 	rw_wunlock(&pvh_global_lock);
 
  	PMAP_UNLOCK(pm);
 }
 
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	This wrapper takes the pvh global lock and the pmap lock and
  *	calls pmap_enter_locked() with M_WAITOK.  The 'access' argument
  *	is not used here.
  */
 
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
     vm_prot_t prot, boolean_t wired)
 {
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	pmap_enter_locked(pmap, va, m, prot, wired, M_WAITOK);
 	rw_wunlock(&pvh_global_lock);
  	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Install a mapping for page "m" at virtual address "va" in "pmap".
  *
  *	The pvh global and pmap locks must be held.
  *
  *	pmap	- target physical map.
  *	va	- virtual address to map.  If it equals vector_page, the
  *		  system page (systempage.pv_pa) is mapped and "m" is
  *		  ignored (treated as NULL).
  *	m	- page to map.
  *	prot	- protection for the mapping.
  *	wired	- non-zero to record the mapping as wired (PVF_WIRED).
  *	flags	- with M_WAITOK, a failed L2 bucket allocation for a user
  *		  pmap drops both locks, waits in VM_WAIT, retakes the locks
  *		  and retries.  Without it, the function returns without
  *		  creating a mapping when that allocation fails.
  */
 static void
 pmap_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     boolean_t wired, int flags)
 {
 	struct l2_bucket *l2b = NULL;
 	struct vm_page *opg;
 	struct pv_entry *pve = NULL;
 	pt_entry_t *ptep, npte, opte;
 	u_int nflags;
 	u_int oflags;
 	vm_paddr_t pa;
 
 	PMAP_ASSERT_LOCKED(pmap);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if (va == vector_page) {
 		/* The vector page has no vm_page; from here on m is NULL. */
 		pa = systempage.pv_pa;
 		m = NULL;
 	} else {
 		KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
 		    (flags & M_NOWAIT) != 0,
 		    ("pmap_enter_locked: page %p is not busy", m));
 		pa = VM_PAGE_TO_PHYS(m);
 	}
 
 	/* Translate the request into pv flags. */
 	nflags = 0;
 	if (prot & VM_PROT_WRITE)
 		nflags |= PVF_WRITE;
 	if (prot & VM_PROT_EXECUTE)
 		nflags |= PVF_EXEC;
 	if (wired)
 		nflags |= PVF_WIRED;
 	PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, "
 	    "wired = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, wired));
 
 	/* Find (or create) the L2 bucket that covers va. */
 	if (pmap == pmap_kernel()) {
 		l2b = pmap_get_l2_bucket(pmap, va);
 		if (l2b == NULL)
 			l2b = pmap_grow_l2_bucket(pmap, va);
 	} else {
 do_l2b_alloc:
 		l2b = pmap_alloc_l2_bucket(pmap, va);
 		if (l2b == NULL) {
 			if (flags & M_WAITOK) {
 				/* Sleep without holding either lock. */
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(&pvh_global_lock);
 				VM_WAIT;
 				rw_wlock(&pvh_global_lock);
 				PMAP_LOCK(pmap);
 				goto do_l2b_alloc;
 			}
 			return;
 		}
 	}
 
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 
 	opte = *ptep;
 	npte = pa;
 	oflags = 0;
 	if (opte) {
 		/*
 		 * There is already a mapping at this address.
 		 * If the physical address is different, lookup the
 		 * vm_page.
 		 */
 		if (l2pte_pa(opte) != pa)
 			opg = PHYS_TO_VM_PAGE(l2pte_pa(opte));
 		else
 			opg = m;
 	} else
 		opg = NULL;
 
 	if ((prot & (VM_PROT_ALL)) ||
 	    (!m || m->md.pvh_attrs & PVF_REF)) {
 		/*
 		 * - The access type indicates that we don't need
 		 *   to do referenced emulation.
 		 * OR
 		 * - The physical page has already been referenced
 		 *   so no need to re-do referenced emulation here.
 		 */
 		npte |= L2_S_PROTO;
 
 		nflags |= PVF_REF;
 
 		if (m && ((prot & VM_PROT_WRITE) != 0 ||
 		    (m->md.pvh_attrs & PVF_MOD))) {
 			/*
 			 * This is a writable mapping, or the page's
 			 * mod state indicates it has already been
 			 * modified.  Record it as modified from the
 			 * outset.
 			 */
 			nflags |= PVF_MOD;
 			if (!(m->md.pvh_attrs & PVF_MOD))
 				vm_page_dirty(m);
 		}
 		if (m && opte)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 	} else {
 		/*
 		 * Need to do page referenced emulation.
 		 */
 		npte |= L2_TYPE_INV;
 	}
 
 	if (prot & VM_PROT_WRITE) {
 		npte |= L2_S_PROT_W;
 		if (m != NULL &&
 		    (m->oflags & VPO_UNMANAGED) == 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 	/*
 	 * The vector page is entered with m == NULL; give it the default
 	 * cache mode instead of dereferencing a NULL page pointer.
 	 */
 	if (m == NULL || m->md.pv_memattr != VM_MEMATTR_UNCACHEABLE)
 		npte |= pte_l2_s_cache_mode;
 
 	if (m && m == opg) {
 		/*
 		 * We're changing the attrs of an existing mapping.
 		 */
 		oflags = pmap_modify_pv(m, pmap, va,
 		    PVF_WRITE | PVF_EXEC | PVF_WIRED |
 		    PVF_MOD | PVF_REF, nflags);
 
 		/*
 		 * We may need to flush the cache if we're
 		 * doing rw-ro...
 		 */
 		if (pmap_is_current(pmap) &&
 		    (oflags & PVF_NC) == 0 &&
 		    (opte & L2_S_PROT_W) != 0 &&
 		    (prot & VM_PROT_WRITE) == 0 &&
 		    (opte & L2_TYPE_MASK) != L2_TYPE_INV) {
 			cpu_dcache_wb_range(va, PAGE_SIZE);
 			cpu_l2cache_wb_range(va, PAGE_SIZE);
 		}
 	} else {
 		/*
 		 * New mapping, or changing the backing page
 		 * of an existing mapping.
 		 */
 		if (opg) {
 			/*
 			 * Replacing an existing mapping with a new one.
 			 * It is part of our managed memory so we
 			 * must remove it from the PV list
 			 */
 			if ((pve = pmap_remove_pv(opg, pmap, va))) {
 				/*
 				 * note for patch: the oflags/invalidation
 				 * was moved because PG_FICTITIOUS pages
 				 * could free the pve
 				 */
 				oflags = pve->pv_flags;
 				/*
 				 * If the old mapping was valid (ref/mod
 				 * emulation creates 'invalid' mappings
 				 * initially) then make sure to frob
 				 * the cache.
 				 */
 				if ((oflags & PVF_NC) == 0 &&
 				    l2pte_valid(opte)) {
 					if (PV_BEEN_EXECD(oflags)) {
 						pmap_idcache_wbinv_range(pmap,
 						    va, PAGE_SIZE);
 					} else if (PV_BEEN_REFD(oflags)) {
 						pmap_dcache_wb_range(pmap, va,
 						    PAGE_SIZE, TRUE,
 						    (oflags & PVF_WRITE) == 0);
 					}
 				}
 
 				/*
 				 * free/allocate a pv_entry for UNMANAGED
 				 * pages if this physical page is not/is
 				 * already mapped.
 				 */
 				if (m && (m->oflags & VPO_UNMANAGED) &&
 				    !m->md.pv_kva &&
 				    TAILQ_EMPTY(&m->md.pv_list)) {
 					pmap_free_pv_entry(pve);
 					pve = NULL;
 				}
 			} else if (m &&
 			    (!(m->oflags & VPO_UNMANAGED) || m->md.pv_kva ||
 			    !TAILQ_EMPTY(&m->md.pv_list)))
 				pve = pmap_get_pv_entry();
 		} else if (m &&
 		    (!(m->oflags & VPO_UNMANAGED) || m->md.pv_kva ||
 		    !TAILQ_EMPTY(&m->md.pv_list)))
 			pve = pmap_get_pv_entry();
 
 		if (m) {
 			if ((m->oflags & VPO_UNMANAGED)) {
 				if (!TAILQ_EMPTY(&m->md.pv_list) ||
 				    m->md.pv_kva) {
 					KASSERT(pve != NULL, ("No pv"));
 					nflags |= PVF_UNMAN;
 					pmap_enter_pv(m, pve, pmap, va, nflags);
 				} else
 					m->md.pv_kva = va;
 			} else {
 				KASSERT(va < kmi.clean_sva ||
 				    va >= kmi.clean_eva,
 		("pmap_enter: managed mapping within the clean submap"));
 				KASSERT(pve != NULL, ("No pv"));
 				pmap_enter_pv(m, pve, pmap, va, nflags);
 			}
 		}
 	}
 	/*
 	 * Make sure userland mappings get the right permissions
 	 */
 	if (pmap != pmap_kernel() && va != vector_page) {
 		npte |= L2_S_PROT_U;
 	}
 
 	/*
 	 * Keep the stats up to date
 	 */
 	if (opte == 0) {
 		l2b->l2b_occupancy++;
 		pmap->pm_stats.resident_count++;
 	}
 
 	/*
 	 * If this is just a wiring change, the two PTEs will be
 	 * identical, so there's no need to update the page table.
 	 */
 	if (npte != opte) {
 		boolean_t is_cached = pmap_is_current(pmap);
 
 		*ptep = npte;
 		if (is_cached) {
 			/*
 			 * We only need to frob the cache/tlb if this pmap
 			 * is current
 			 */
 			PTE_SYNC(ptep);
 			if (L1_IDX(va) != L1_IDX(vector_page) &&
 			    l2pte_valid(npte)) {
 				/*
 				 * This mapping is likely to be accessed as
 				 * soon as we return to userland. Fix up the
 				 * L1 entry to avoid taking another
 				 * page/domain fault.
 				 */
 				pd_entry_t *pl1pd, l1pd;
 
 				pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)];
 				l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) |
 				    L1_C_PROTO;
 				if (*pl1pd != l1pd) {
 					*pl1pd = l1pd;
 					PTE_SYNC(pl1pd);
 				}
 			}
 		}
 
 		/* Drop any stale TLB entry left by the previous mapping. */
 		if (PV_BEEN_EXECD(oflags))
 			pmap_tlb_flushID_SE(pmap, va);
 		else if (PV_BEEN_REFD(oflags))
 			pmap_tlb_flushD_SE(pmap, va);
 
 
 		if (m)
 			pmap_fix_cache(m, pmap, va);
 	}
 }
 
 /*
  * Map a run of resident pages that all belong to one object.
  *
  * m_start is mapped at "start"; every following page on the listq
  * chain is mapped at start plus its pindex distance from m_start.
  * The walk stops at the end of the chain or at the first page whose
  * distance from m_start is not below the number of pages in
  * [start, end).  Virtual pages with no corresponding resident page
  * are left unmapped.
  *
  * The mappings are never writable or wired: prot is masked down to
  * read/execute, and the entries are made with M_NOWAIT.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_pindex_t npages, idx;
 	vm_prot_t ro_prot;
 	vm_page_t pg;
 
 	npages = atop(end - start);
 	ro_prot = prot & (VM_PROT_READ | VM_PROT_EXECUTE);
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		idx = pg->pindex - m_start->pindex;
 		if (idx >= npages)
 			break;
 		pmap_enter_locked(pmap, start + ptoa(idx), pg, ro_prot,
 		    FALSE, M_NOWAIT);
 	}
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * this code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  *
  * In this implementation it is a locking wrapper around
  * pmap_enter_locked(): write permission is stripped from prot, the
  * mapping is entered unwired, and M_NOWAIT is passed so the call does
  * not sleep for an L2 bucket.
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
 	rw_wlock(&pvh_global_lock);
  	PMAP_LOCK(pmap);
 	pmap_enter_locked(pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
 	    FALSE, M_NOWAIT);
 	rw_wunlock(&pvh_global_lock);
  	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Routine:	pmap_change_wiring
  *	Function:	Change the wiring attribute for a map/virtual-address
  *			pair.
  *	In/out conditions:
  *			The mapping must already exist in the pmap.
  *
  *	Only the PVF_WIRED flag of the matching pv entry is updated (via
  *	pmap_modify_pv()); the PTE itself is read but not written here.
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_page_t pg;
 
 	rw_wlock(&pvh_global_lock);
  	PMAP_LOCK(pmap);
 	l2b = pmap_get_l2_bucket(pmap, va);
 	KASSERT(l2b, ("No l2b bucket in pmap_change_wiring"));
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 	pte = *ptep;
 	/* Find the vm_page behind the mapping; skip the update if none. */
 	pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
 	if (pg)
 		pmap_modify_pv(pg, pmap, va, PVF_WIRED, wired ? PVF_WIRED : 0);
 	rw_wunlock(&pvh_global_lock);
  	PMAP_UNLOCK(pmap);
 }
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *	This implementation is intentionally empty.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
     vm_size_t len, vm_offset_t src_addr)
 {
 }
 
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  *
  *	Takes the pmap lock around pmap_extract_locked() and returns
  *	whatever that helper returns (0 when it finds no mapping).
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t pa;
 
 	PMAP_LOCK(pmap);
 	pa = pmap_extract_locked(pmap, va);
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  * Translate va to a physical address using pmap's page tables.
  *
  * For any pmap other than kernel_pmap the pmap lock is asserted held.
  * Returns 0 when no L2 table, no L2 bucket, or no PTE exists for va.
  */
 static vm_paddr_t
 pmap_extract_locked(pmap_t pmap, vm_offset_t va)
 {
 	struct l2_dtable *dtable;
 	pt_entry_t *bucket_ptes, entry;
 	pd_entry_t l1desc;
 	u_int idx;
 
 	if (pmap != kernel_pmap)
 		PMAP_ASSERT_LOCKED(pmap);
 
 	idx = L1_IDX(va);
 	l1desc = pmap->pm_l1->l1_kva[idx];
 
 	if (l1pte_section_p(l1desc)) {
 		/*
 		 * These should only happen for the kernel pmap.
 		 */
 		KASSERT(pmap == kernel_pmap, ("unexpected section"));
 		/* XXX: what to do about the bits > 32 ? */
 		if (l1desc & L1_S_SUPERSEC)
 			return ((l1desc & L1_SUP_FRAME) |
 			    (va & L1_SUP_OFFSET));
 		return ((l1desc & L1_S_FRAME) | (va & L1_S_OFFSET));
 	}
 
 	/*
 	 * A valid L1 descriptor does not prove that a mapping exists,
 	 * so the answer has to come from the L2 dtable.
 	 */
 	dtable = pmap->pm_l2[L2_IDX(idx)];
 	if (dtable == NULL)
 		return (0);
 	bucket_ptes = dtable->l2_bucket[L2_BUCKET(idx)].l2b_kva;
 	if (bucket_ptes == NULL)
 		return (0);
 
 	entry = bucket_ptes[l2pte_index(va)];
 	if (entry == 0)
 		return (0);
 
 	/* Large pages use a different frame mask than small pages. */
 	if ((entry & L2_TYPE_MASK) == L2_TYPE_L)
 		return ((entry & L2_L_FRAME) | (va & L2_L_OFFSET));
 	return ((entry & L2_S_FRAME) | (va & L2_S_OFFSET));
 }
 
 /*
  * Atomically extract and hold the physical page with the given
  * pmap and virtual address pair if that mapping permits the given
  * protection.
  *
  * Returns the held page, or NULL when there is no mapping for va or
  * when write access was requested and the mapping is not writable.
  *
  * vm_page_pa_tryrelock() may have to drop the pmap lock to obtain the
  * page lock; when it reports that, the lookup is restarted from
  * "retry" with a page lock already recorded in paddr.  Every exit
  * therefore goes through "out" so that lock is always released.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	struct l2_dtable *l2;
 	pd_entry_t l1pd;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa, paddr;
 	vm_page_t m = NULL;
 	u_int l1idx;
 
 	l1idx = L1_IDX(va);
 	paddr = 0;
 
 	PMAP_LOCK(pmap);
 retry:
 	l1pd = pmap->pm_l1->l1_kva[l1idx];
 	if (l1pte_section_p(l1pd)) {
 		/*
 		 * These should only happen for pmap_kernel()
 		 */
 		KASSERT(pmap == pmap_kernel(), ("huh"));
 		/* XXX: what to do about the bits > 32 ? */
 		if (l1pd & L1_S_SUPERSEC)
 			pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET);
 		else
 			pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET);
 		if (vm_page_pa_tryrelock(pmap, pa & PG_FRAME, &paddr))
 			goto retry;
 		if (l1pd & L1_S_PROT_W || (prot & VM_PROT_WRITE) == 0) {
 			m = PHYS_TO_VM_PAGE(pa);
 			vm_page_hold(m);
 		}
 	} else {
 		/*
 		 * Note that we can't rely on the validity of the L1
 		 * descriptor as an indication that a mapping exists.
 		 * We have to look it up in the L2 dtable.
 		 */
 		l2 = pmap->pm_l2[L2_IDX(l1idx)];
 
 		/*
 		 * No mapping: leave through the common exit so that a
 		 * page lock taken before a retry is not leaked.
 		 */
 		if (l2 == NULL ||
 		    (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL)
 			goto out;
 
 		ptep = &ptep[l2pte_index(va)];
 		pte = *ptep;
 
 		if (pte == 0)
 			goto out;
 
 		if (pte & L2_S_PROT_W || (prot & VM_PROT_WRITE) == 0) {
 			switch (pte & L2_TYPE_MASK) {
 			case L2_TYPE_L:
 				pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET);
 				break;
 
 			default:
 				pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET);
 				break;
 			}
 			if (vm_page_pa_tryrelock(pmap, pa & PG_FRAME, &paddr))
 				goto retry;
 			m = PHYS_TO_VM_PAGE(pa);
 			vm_page_hold(m);
 		}
 	}
 
 out:
 	PMAP_UNLOCK(pmap);
 	PA_UNLOCK_COND(paddr);
 	return (m);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * Initializes the pmap lock, allocates the L1 table, clears the L2
  * table array, the active CPU set and the statistics, and sets up an
  * empty pv list.  Always returns 1.
  */
 
 int
 pmap_pinit(pmap_t pmap)
 {
 	PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap));
 	
 	PMAP_LOCK_INIT(pmap);
 	pmap_alloc_l1(pmap);
 	bzero(pmap->pm_l2, sizeof(pmap->pm_l2));
 
 	CPU_ZERO(&pmap->pm_active);
 		
 	TAILQ_INIT(&pmap->pm_pvlist);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	/* A freshly initialized pmap starts with a resident count of 1. */
 	pmap->pm_stats.resident_count = 1;
 	/*
 	 * When the vector page lies below KERNBASE, enter it into this
 	 * pmap as a wired, read-only mapping.
 	 */
 	if (vector_page < KERNBASE) {
 		pmap_enter(pmap, vector_page,
 		    VM_PROT_READ, PHYS_TO_VM_PAGE(systempage.pv_pa),
 		    VM_PROT_READ, 1);
 	}
 	return (1);
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 
 /*
  * Return a pv_entry to pvzone and decrement the global count of
  * outstanding pv entries.
  */
 static void
 pmap_free_pv_entry(pv_entry_t pv)
 {
 	pv_entry_count--;
 	uma_zfree(pvzone, pv);
 }
 
 
 /*
  * Allocate a pv_entry from pvzone.
  *
  * The global entry count is bumped first; once it exceeds the high
  * water mark the pagedaemon is woken.  The allocation itself is
  * M_NOWAIT, and its result is returned unchanged to the caller.
  * The memory allocation is performed bypassing the malloc code
  * because of the possibility of allocations at interrupt time.
  */
 static pv_entry_t
 pmap_get_pv_entry(void)
 {
 
 	if (++pv_entry_count > pv_entry_high_water)
 		pagedaemon_wakeup();
 	return (uma_zalloc(pvzone, M_NOWAIT));
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The range is processed one L2 bucket at a time.  For each
  *	non-zero PTE the pv entry (if any) is removed and freed, caches
  *	and TLB are cleaned for valid mappings of the current pmap, and
  *	the PTE is zeroed.  The first PMAP_REMOVE_CLEAN_LIST_SIZE such
  *	mappings are flushed individually; after that the whole cache is
  *	written back once and a full TLB flush is issued at the end.
  */
 #define	PMAP_REMOVE_CLEAN_LIST_SIZE	3
 void
 pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
 {
 	struct l2_bucket *l2b;
 	vm_offset_t next_bucket;
 	pt_entry_t *ptep;
 	u_int total;
 	u_int mappings, is_exec, is_refd;
 	int flushall = 0;
 
 
 	/*
 	 * we lock in the pmap => pv_head direction
 	 */
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pm);
 	/* Counts valid current-pmap mappings handled so far. */
 	total = 0;
 	while (sva < eva) {
 		/*
 		 * Do one L2 bucket's worth at a time.
 		 */
 		next_bucket = L2_NEXT_BUCKET(sva);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		l2b = pmap_get_l2_bucket(pm, sva);
 		if (l2b == NULL) {
 			/* Nothing mapped in this bucket's range. */
 			sva = next_bucket;
 			continue;
 		}
 
 		ptep = &l2b->l2b_kva[l2pte_index(sva)];
 		/* Number of PTEs cleared in this bucket. */
 		mappings = 0;
 
 		while (sva < next_bucket) {
 			struct vm_page *pg;
 			pt_entry_t pte;
 			vm_paddr_t pa;
 
 			pte = *ptep;
 
 			if (pte == 0) {
 				/*
 				 * Nothing here, move along
 				 */
 				sva += PAGE_SIZE;
 				ptep++;
 				continue;
 			}
 
 			pm->pm_stats.resident_count--;
 			pa = l2pte_pa(pte);
 			/*
 			 * Defaults used when no pv entry is found:
 			 * treat the page as referenced but not executed.
 			 */
 			is_exec = 0;
 			is_refd = 1;
 
 			/*
 			 * Update flags. In a number of circumstances,
 			 * we could cluster a lot of these and do a
 			 * number of sequential pages in one go.
 			 */
 			if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) {
 				struct pv_entry *pve;
 
 				pve = pmap_remove_pv(pg, pm, sva);
 				if (pve) {
 					is_exec = PV_BEEN_EXECD(pve->pv_flags);
 					is_refd = PV_BEEN_REFD(pve->pv_flags);
 					pmap_free_pv_entry(pve);
 				}
 			}
 
 			if (l2pte_valid(pte) && pmap_is_current(pm)) {
 				if (total < PMAP_REMOVE_CLEAN_LIST_SIZE) {
 					total++;
 			   		if (is_exec) {
         					cpu_idcache_wbinv_range(sva,
 						    PAGE_SIZE);
 						cpu_l2cache_wbinv_range(sva,
 						    PAGE_SIZE);
 						cpu_tlb_flushID_SE(sva);
 			   		} else if (is_refd) {
 						cpu_dcache_wbinv_range(sva,
 						    PAGE_SIZE);
 						cpu_l2cache_wbinv_range(sva,
 						    PAGE_SIZE);
 						cpu_tlb_flushD_SE(sva);
 					}
 				} else if (total == PMAP_REMOVE_CLEAN_LIST_SIZE) {
 					/* flushall will also only get set for
 					 * for a current pmap
 					 */
 					cpu_idcache_wbinv_all();
 					cpu_l2cache_wbinv_all();
 					flushall = 1;
 					/* Bump past the limit so this runs once. */
 					total++;
 				}
 			}
 			*ptep = 0;
 			PTE_SYNC(ptep);
 
 			sva += PAGE_SIZE;
 			ptep++;
 			mappings++;
 		}
 
 		pmap_free_l2_bucket(pm, l2b, mappings);
 	}
 
 	rw_wunlock(&pvh_global_lock);
 	if (flushall)
 		cpu_tlb_flushID();
  	PMAP_UNLOCK(pm);
 }
 
 /*
  * pmap_zero_page()
  *
  * Zero a given physical page by mapping it at a page hook point.
  * In doing the zero page op, the page we zero is mapped cachable, as with
  * StrongARM accesses to non-cached pages are non-burst making writing
  * _any_ bulk data very slow.
  */
 #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined(CPU_XSCALE_CORE3)
 /*
  * Zero "size" bytes starting at offset "off" within the physical page
  * "phys".
  *
  * A platform-supplied _arm_bzero hook is tried first when one is
  * installed and size is at least _min_bzero_size.  Otherwise the page
  * is zeroed either through its arm_ptovirt() address
  * (ARM_USE_SMALL_ALLOC) or through the cdstp hook mapping, which is
  * serialized by cmtx.
  */
 void
 pmap_zero_page_generic(vm_paddr_t phys, int off, int size)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	char *dstpg;
 #endif
 
 	/* A return of 0 from the hook means it did the work. */
 	if (_arm_bzero && size >= _min_bzero_size &&
 	    _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0)
 		return;
 
 #ifdef ARM_USE_SMALL_ALLOC
 	dstpg = (char *)arm_ptovirt(phys);
 	if (off || size != PAGE_SIZE) {
 		bzero(dstpg + off, size);
 		cpu_dcache_wbinv_range((vm_offset_t)(dstpg + off), size);
 		cpu_l2cache_wbinv_range((vm_offset_t)(dstpg + off), size);
 	} else {
 		bzero_page((vm_offset_t)dstpg);
 		cpu_dcache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE);
 		cpu_l2cache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE);
 	}
 #else
 
 	mtx_lock(&cmtx);
 	/*
 	 * Hook in the page, zero it, invalidate the TLB as needed.
 	 *
 	 * Note the temporary zero-page mapping must be a non-cached page in
 	 * order to work without corruption when write-allocate is enabled.
 	 */
 	*cdst_pte = L2_S_PROTO | phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE);
 	PTE_SYNC(cdst_pte);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	/* Partial requests use bzero(); a whole page uses bzero_page(). */
 	if (off || size != PAGE_SIZE)
 		bzero((void *)(cdstp + off), size);
 	else
 		bzero_page(cdstp);
 
 	mtx_unlock(&cmtx);
 #endif
 }
 #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */
 
 #if ARM_MMU_XSCALE == 1
 /*
  * XScale variant of the page-zeroing routine: zero "size" bytes at
  * offset "off" within the physical page "phys".
  *
  * The _arm_bzero hook is tried first.  Without ARM_USE_SMALL_ALLOC the
  * page is mapped at cdstp with the mini-data cache attributes, zeroed
  * under cmtx, and the mini-data cache is cleaned afterwards.
  */
 void
 pmap_zero_page_xscale(vm_paddr_t phys, int off, int size)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	char *dstpg;
 #endif
 
 	/* A return of 0 from the hook means it did the work. */
 	if (_arm_bzero && size >= _min_bzero_size &&
 	    _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0)
 		return;
 #ifdef ARM_USE_SMALL_ALLOC
 	dstpg = (char *)arm_ptovirt(phys);
 	if (off || size != PAGE_SIZE) {
 		bzero(dstpg + off, size);
 		cpu_dcache_wbinv_range((vm_offset_t)(dstpg + off), size);
 	} else {
 		bzero_page((vm_offset_t)dstpg);
 		cpu_dcache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE);
 	}
 #else
 	mtx_lock(&cmtx);
 	/*
 	 * Hook in the page, zero it, and purge the cache for that
 	 * zeroed page. Invalidate the TLB as needed.
 	 */
 	*cdst_pte = L2_S_PROTO | phys |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) |
 	    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);	/* mini-data */
 	PTE_SYNC(cdst_pte);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	if (off || size != PAGE_SIZE)
 		bzero((void *)(cdstp + off), size);
 	else
 		bzero_page(cdstp);
 	mtx_unlock(&cmtx);
 	xscale_cache_clean_minidata();
 #endif
 }
 
 /*
  * Change the PTEs for the specified kernel mappings such that they
  * will use the mini data cache instead of the main data cache.
  *
  * va/size describe the kernel virtual range to convert.  The range is
  * walked one L2 bucket at a time; each PTE that is not already a
  * mini-data mapping has its data cache lines written back and
  * invalidated, its TLB entry flushed, and its L2_B bit cleared.
  *
  * NOTE(review): the bucket returned by pmap_get_l2_bucket() is used
  * without a NULL check, so the range is presumably required to be
  * fully mapped already -- confirm against callers.
  */
 void
 pmap_use_minicache(vm_offset_t va, vm_size_t size)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, *sptep, pte;
 	vm_offset_t next_bucket, eva;
 
 #if (ARM_NMMUS > 1) || defined(CPU_XSCALE_CORE3)
 	if (xscale_use_minidata == 0)
 		return;
 #endif
 
 	eva = va + size;
 
 	while (va < eva) {
 		next_bucket = L2_NEXT_BUCKET(va);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 
 		/* sptep remembers the first PTE so the range can be synced. */
 		sptep = ptep = &l2b->l2b_kva[l2pte_index(va)];
 
 		while (va < next_bucket) {
 			pte = *ptep;
 			if (!l2pte_minidata(pte)) {
 				cpu_dcache_wbinv_range(va, PAGE_SIZE);
 				cpu_tlb_flushD_SE(va);
 				*ptep = pte & ~L2_B;
 			}
 			ptep++;
 			va += PAGE_SIZE;
 		}
 		PTE_SYNC_RANGE(sptep, (u_int)(ptep - sptep));
 	}
 	cpu_cpwait();
 }
 #endif /* ARM_MMU_XSCALE == 1 */
 
 /*
  *	pmap_zero_page zeros the specified hardware page by mapping
  *	the page into KVM and using bzero to clear its contents.
  *
  *	The work is delegated to pmap_zero_page_func() for the whole
  *	page (offset 0, PAGE_SIZE bytes).
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	pmap_zero_page_func(VM_PAGE_TO_PHYS(m), 0, PAGE_SIZE);
 }
 
 
 /*
  *	pmap_zero_page_area zeros the specified hardware page by mapping
  *	the page into KVM and using bzero to clear its contents.
  *
  *	off and size may not cover an area beyond a single hardware page.
  *	Both are passed through unchanged to pmap_zero_page_func().
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 
 	pmap_zero_page_func(VM_PAGE_TO_PHYS(m), off, size);
 }
 
 
 /*
  *	pmap_zero_page_idle zeros the specified hardware page by mapping
  *	the page into KVM and using bzero to clear its contents.  This
  *	is intended to be called from the vm_pagezero process only and
  *	outside of Giant.
  *
  *	This implementation simply calls pmap_zero_page().
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 
 	pmap_zero_page(m);
 }
 
 #if 0
 /*
  * pmap_clean_page()
  *
  * This is a local function used to work out the best strategy to clean
  * a single page referenced by its entry in the PV table. It should be used by
  * pmap_copy_page, pmap_zero_page and maybe some others later on.
  *
  * Its policy is effectively:
  *  o If there are no mappings, we don't bother doing anything with the cache.
  *  o If there is one mapping, we clean just that page.
  *  o If there are multiple mappings, we clean the entire cache.
  *
  * So that some functions can be further optimised, it returns 0 if it didn't
  * clean the entire cache, or 1 if it did.
  *
  * XXX One bug in this routine is that if the pv_entry has a single page
  * mapped at 0x00000000 a whole cache clean will be performed rather than
  * just the 1 page. This should not occur in everyday use, and if it does
  * it will just result in a less efficient clean for the page.
  *
  * We don't yet use this function but may want to.  It is currently
  * compiled out.
  */
 static int
 pmap_clean_page(struct pv_entry *pv, boolean_t is_src)
 {
 	pmap_t pm, pm_to_clean = NULL;
 	struct pv_entry *npv;
 	u_int cache_needs_cleaning = 0;
 	u_int flags = 0;
 	vm_offset_t page_to_clean = 0;
 
 	if (pv == NULL) {
 		/* nothing mapped in so nothing to flush */
 		return (0);
 	}
 
 	/*
 	 * Since we flush the cache each time we change to a different
 	 * user vmspace, we only need to flush the page if it is in the
 	 * current pmap.
 	 */
 	if (curthread)
 		pm = vmspace_pmap(curproc->p_vmspace);
 	else
 		pm = pmap_kernel();
 
 	/* Only mappings in the kernel pmap or the current pmap matter. */
 	for (npv = pv; npv; npv = TAILQ_NEXT(npv, pv_list)) {
 		if (npv->pv_pmap == pmap_kernel() || npv->pv_pmap == pm) {
 			flags |= npv->pv_flags;
 			/*
 			 * The page is mapped non-cacheable in
 			 * this map.  No need to flush the cache.
 			 */
 			if (npv->pv_flags & PVF_NC) {
 #ifdef DIAGNOSTIC
 				if (cache_needs_cleaning)
 					panic("pmap_clean_page: "
 					    "cache inconsistency");
 #endif
 				break;
 			} else if (is_src && (npv->pv_flags & PVF_WRITE) == 0)
 				continue;
 			if (cache_needs_cleaning) {
 				/*
 				 * Second relevant mapping found: give up on
 				 * the single-page clean and do the lot.
 				 */
 				page_to_clean = 0;
 				break;
 			} else {
 				page_to_clean = npv->pv_va;
 				pm_to_clean = npv->pv_pmap;
 			}
 			cache_needs_cleaning = 1;
 		}
 	}
 	if (page_to_clean) {
 		/* Exactly one mapping to clean. */
 		if (PV_BEEN_EXECD(flags))
 			pmap_idcache_wbinv_range(pm_to_clean, page_to_clean,
 			    PAGE_SIZE);
 		else
 			pmap_dcache_wb_range(pm_to_clean, page_to_clean,
 			    PAGE_SIZE, !is_src, (flags & PVF_WRITE) == 0);
 	} else if (cache_needs_cleaning) {
 		/* Several mappings (or one at va 0): clean everything. */
 		if (PV_BEEN_EXECD(flags))
 			pmap_idcache_wbinv_all(pm);
 		else
 			pmap_dcache_wbinv_all(pm);
 		return (1);
 	}
 	return (0);
 }
 #endif
 
 /*
  *	pmap_copy_page copies the specified (machine independent)
  *	page by mapping the page into virtual memory and using
  *	bcopy to copy the page, one machine dependent page at a
  *	time.
  */
 
 /*
  * pmap_copy_page()
  *
  * Copy one physical page into another, by mapping the pages into
  * hook points. The same comment regarding cachability as in
  * pmap_zero_page also applies here.
  */
 #if  (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined (CPU_XSCALE_CORE3)
 /*
  * Copy the physical page "src" to the physical page "dst" through the
  * csrcp/cdstp hook mappings.  The hook PTEs are set up, and the copy
  * is performed, with cmtx held.
  */
 void
 pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst)
 {
 #if 0
 	struct vm_page *src_pg = PHYS_TO_VM_PAGE(src);
 #endif
 
 	/*
 	 * Clean the source page.  Hold the source page's lock for
 	 * the duration of the copy so that no other mappings can
 	 * be created while we have a potentially aliased mapping.
 	 */
 #if 0
 	/*
 	 * XXX: Not needed while we call cpu_dcache_wbinv_all() in
 	 * pmap_copy_page().
 	 */
 	(void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE);
 #endif
 	/*
 	 * Map the pages into the page hook points, copy them, and purge
 	 * the cache for the appropriate page. Invalidate the TLB
 	 * as required.
 	 */
 	mtx_lock(&cmtx);
 	*csrc_pte = L2_S_PROTO | src |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode;
 	PTE_SYNC(csrc_pte);
 	*cdst_pte = L2_S_PROTO | dst |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode;
 	PTE_SYNC(cdst_pte);
 	cpu_tlb_flushD_SE(csrcp);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	bcopy_page(csrcp, cdstp);
 	mtx_unlock(&cmtx);
 	/*
 	 * NOTE(review): the cache maintenance below runs after cmtx has
 	 * been released -- confirm nothing can reuse the hook mappings
 	 * in between.
 	 */
 	cpu_dcache_inv_range(csrcp, PAGE_SIZE);
 	cpu_dcache_wbinv_range(cdstp, PAGE_SIZE);
 	cpu_l2cache_inv_range(csrcp, PAGE_SIZE);
 	cpu_l2cache_wbinv_range(cdstp, PAGE_SIZE);
 }
 #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */
 
 #if ARM_MMU_XSCALE == 1
 /*
  * XScale variant of the page copy: copy the physical page "src" to
  * "dst" through the csrcp/cdstp hook mappings, both set up with the
  * mini-data cache attributes, then clean the mini-data cache.
  */
 void
 pmap_copy_page_xscale(vm_paddr_t src, vm_paddr_t dst)
 {
 #if 0
 	/* XXX: Only needed for pmap_clean_page(), which is commented out. */
 	struct vm_page *src_pg = PHYS_TO_VM_PAGE(src);
 #endif
 
 	/*
 	 * Clean the source page.  Hold the source page's lock for
 	 * the duration of the copy so that no other mappings can
 	 * be created while we have a potentially aliased mapping.
 	 */
 #if 0
 	/*
 	 * XXX: Not needed while we call cpu_dcache_wbinv_all() in
 	 * pmap_copy_page().
 	 */
 	(void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE);
 #endif
 	/*
 	 * Map the pages into the page hook points, copy them, and purge
 	 * the cache for the appropriate page. Invalidate the TLB
 	 * as required.
 	 */
 	mtx_lock(&cmtx);
 	*csrc_pte = L2_S_PROTO | src |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_READ) |
 	    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);	/* mini-data */
 	PTE_SYNC(csrc_pte);
 	*cdst_pte = L2_S_PROTO | dst |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) |
 	    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);	/* mini-data */
 	PTE_SYNC(cdst_pte);
 	cpu_tlb_flushD_SE(csrcp);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	bcopy_page(csrcp, cdstp);
 	mtx_unlock(&cmtx);
 	xscale_cache_clean_minidata();
 }
 #endif /* ARM_MMU_XSCALE == 1 */
 
 /*
  * Copy the contents of page "src" into page "dst".
  *
  * The whole data cache and L2 cache are written back and invalidated
  * first.  A platform-supplied _arm_memcpy hook is then tried when one
  * is installed and PAGE_SIZE is at least _min_memcpy_size; otherwise
  * the copy goes through the arm_ptovirt() addresses
  * (ARM_USE_SMALL_ALLOC) or through pmap_copy_page_func().
  */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	vm_offset_t srcpg, dstpg;
 #endif
 
 	cpu_dcache_wbinv_all();
 	cpu_l2cache_wbinv_all();
 	/* A return of 0 from the hook means it did the copy. */
 	if (_arm_memcpy && PAGE_SIZE >= _min_memcpy_size &&
 	    _arm_memcpy((void *)VM_PAGE_TO_PHYS(dst),
 	    (void *)VM_PAGE_TO_PHYS(src), PAGE_SIZE, IS_PHYSICAL) == 0)
 		return;
 #ifdef ARM_USE_SMALL_ALLOC
 	srcpg = arm_ptovirt(VM_PAGE_TO_PHYS(src));
 	dstpg = arm_ptovirt(VM_PAGE_TO_PHYS(dst));
 	bcopy_page(srcpg, dstpg);
 	cpu_dcache_wbinv_range(dstpg, PAGE_SIZE);
 	cpu_l2cache_wbinv_range(dstpg, PAGE_SIZE);
 #else
 	pmap_copy_page_func(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
 #endif
 }
 
 
 
 
 /*
  * Report whether page "m" has a mapping in "pmap".
  *
  * This is a bounded check: at most the first 16 entries of the
  * page's pv list are examined, so FALSE can be returned for a page
  * whose mapping in pmap sits further down the list.  The page must
  * be managed.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pve;
 	boolean_t found;
 	int budget;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 
 	found = FALSE;
 	budget = 16;
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pve, &m->md.pv_list, pv_list) {
 		if (pve->pv_pmap == pmap) {
 			found = TRUE;
 			break;
 		}
 		/* Give up once the inspection budget is used up. */
 		if (--budget == 0)
 			break;
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Count the wired mappings of the given physical page by walking
  *	its pv list under the pvh global lock.  Unmanaged pages are not
  *	examined and always yield 0.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	pv_entry_t pve;
 	int nwired = 0;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		rw_wlock(&pvh_global_lock);
 		TAILQ_FOREACH(pve, &m->md.pv_list, pv_list) {
 			if ((pve->pv_flags & PVF_WIRED) != 0)
 				nwired++;
 		}
 		rw_wunlock(&pvh_global_lock);
 	}
 	return (nwired);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return the count of reference bits for a page, clearing all of them.
  *
  *	The page must be managed.  The return value is whatever
  *	pmap_clearbit(m, PVF_REF) returns.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	return (pmap_clearbit(m, PVF_REF));
 }
 
 
 /*
  *	pmap_is_modified:
  *
  *	Return whether the page's attributes carry the modified bit
  *	(PVF_MOD).  The page must be managed.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 	return ((m->md.pvh_attrs & PVF_MOD) != 0);
 }
 
 
 /*
  *	Clear the modify bits on the specified physical page.
  *
  *	The page must be managed and not busy, and its object must be
  *	locked by the caller.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	KASSERT((m->oflags & VPO_BUSY) == 0,
 	    ("pmap_clear_modify: page %p is busy", m));
 
 	/*
 	 * A page without PGA_WRITEABLE cannot have modified mappings,
 	 * and with the object locked and the page not VPO_BUSY that
 	 * flag cannot be set concurrently.  So there is work to do only
 	 * when the page is writeable and currently marked modified.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) != 0 &&
 	    (m->md.pvh_attrs & PVF_MOD) != 0)
 		pmap_clearbit(m, PVF_MOD);
 }
 
 
 /*
  *	pmap_is_referenced:
  *
  *	Return whether or not the specified physical page was referenced
  *	in any physical maps.
  *
  *	The answer is read from the PVF_REF bit of the page's
  *	attributes.  The page must be managed.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	return ((m->md.pvh_attrs & PVF_REF) != 0);
 }
 
 /*
  *	pmap_clear_reference:
  *
  *	Clear the reference bit on the specified physical page.
  *
  *	The page must be managed.  pmap_clearbit() is only called when
  *	the page's attributes currently have PVF_REF set.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
 	if (m->md.pvh_attrs & PVF_REF)
 		pmap_clearbit(m, PVF_REF);
 }
 
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  *
  * The page must be managed and its object must be locked by the
  * caller.  The work is done by pmap_clearbit(m, PVF_WRITE).
  * NOTE(review): whether that also clears the modified state is not
  * visible from this function -- confirm in pmap_clearbit().
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
 	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
 	 * is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if ((m->oflags & VPO_BUSY) != 0 ||
 	    (m->aflags & PGA_WRITEABLE) != 0)
 		pmap_clearbit(m, PVF_WRITE);
 }
 
 
 /*
  * perform the pmap work for mincore
  *
  * Returns a mask of MINCORE_* flags describing the mapping of "addr"
  * in "pmap", or 0 when there is no L2 bucket or no valid PTE for it.
  *
  * When the page is managed and the result does not already include
  * both MINCORE_MODIFIED_OTHER and MINCORE_REFERENCED_OTHER, the page
  * lock for the mapping's physical address is acquired through
  * vm_page_pa_tryrelock() and recorded in *locked_pa.  In every other
  * case any lock already recorded in *locked_pa is released.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	vm_page_t m;
 	int val;
 	boolean_t managed;
 
 	PMAP_LOCK(pmap);
 retry:
 	l2b = pmap_get_l2_bucket(pmap, addr);
         if (l2b == NULL) {
                 val = 0;
                 goto out;
         }
 	ptep = &l2b->l2b_kva[l2pte_index(addr)];
 	pte = *ptep;
 	if (!l2pte_valid(pte)) {
 		val = 0;
 		goto out;
 	}
 	val = MINCORE_INCORE;
 	/* A writable PTE is reported as modified. */
 	if (pte & L2_S_PROT_W)
 		val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
         managed = false;
 	pa = l2pte_pa(pte);
         m = PHYS_TO_VM_PAGE(pa);
         if (m != NULL && !(m->oflags & VPO_UNMANAGED))
                 managed = true;
 	if (managed) {
 		/*
 		 * The ARM pmap tries to maintain a per-mapping
 		 * reference bit.  The trouble is that it's kept in
 		 * the PV entry, not the PTE, so it's costly to access
 		 * here.  You would need to acquire the pvh global
 		 * lock, call pmap_find_pv(), and introduce a custom
 		 * version of vm_page_pa_tryrelock() that releases and
 		 * reacquires the pvh global lock.  In the end, I
 		 * doubt it's worthwhile.  This may falsely report
 		 * the given address as referenced.
 		 */
 		if ((m->md.pvh_attrs & PVF_REF) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	}
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 /*
  * The "out" label sits inside the else arm, so the early exits above
  * share the unlock with the case where no page lock is needed.
  */
 out:
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 
 /*
  * Instruction cache synchronization hook for the range [va, va + sz) of
  * pmap "pm".  This implementation has an empty body and does nothing.
  */
 void
 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 {
 }
 
 
 /*
  *	Increase the starting virtual address of the given mapping if a
  *	different alignment might result in more superpage mappings.
  *
  *	This implementation has an empty body: "*addr" is never changed.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 }
 
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  *
  * "pa" need not be page aligned: the mapping starts at the page that
  * contains "pa" and covers every page touched by [pa, pa + size).  The
  * returned pointer carries the same offset within its page as "pa".
  * Panics if no kernel virtual address space can be allocated.
  */
 void *
 pmap_mapdev(vm_offset_t pa, vm_size_t size)
 {
 	vm_offset_t va, tmpva, offset;
 
 	/*
 	 * Split "pa" into a page-aligned base and a sub-page offset.  The
 	 * offset is added to the length before rounding so that a range
 	 * straddling a page boundary gets its last page mapped as well.
 	 */
 	offset = pa & PAGE_MASK;
 	pa -= offset;
 	size = roundup(offset + size, PAGE_SIZE);
 
 	GIANT_REQUIRED;
 
 	va = kmem_alloc_nofault(kernel_map, size);
 	if (!va)
 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 	/* Enter one page at a time until the rounded length is consumed. */
 	for (tmpva = va; size > 0;) {
 		pmap_kenter_internal(tmpva, pa, 0);
 		size -= PAGE_SIZE;
 		tmpva += PAGE_SIZE;
 		pa += PAGE_SIZE;
 	}
 
 	return ((void *)(va + offset));
 }
 
 #define BOOTSTRAP_DEBUG
 
 /*
  * pmap_map_section:
  *
  *	Write one L1 section descriptor mapping "va" to "pa" into the L1
  *	table at "l1pt".  Both addresses are asserted to be section
  *	aligned.  "cache" selects the cache-mode bits; any value other
  *	than PTE_CACHE or PTE_PAGETABLE adds none.
  */
 void
 pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
     int prot, int cache)
 {
 	pd_entry_t *slot = &((pd_entry_t *)l1pt)[va >> L1_S_SHIFT];
 	pd_entry_t cachebits;
 
 	KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2"));
 
 	if (cache == PTE_CACHE)
 		cachebits = pte_l1_s_cache_mode;
 	else if (cache == PTE_PAGETABLE)
 		cachebits = pte_l1_s_cache_mode_pt;
 	else
 		cachebits = 0;
 
 	*slot = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | cachebits |
 	    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 	PTE_SYNC(slot);
 }
 
 /*
  * pmap_link_l2pt:
  *
  *	Point the L1 slot covering "va" at the L2 page table whose
  *	physical address is l2pv->pv_pa, then add "l2pv" to the head of
  *	kernel_pt_list.
  */
 void
 pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv)
 {
 	pd_entry_t *entry = &((pd_entry_t *)l1pt)[va >> L1_S_SHIFT];
 	pd_entry_t proto = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO;
 
 #ifdef VERBOSE_INIT_ARM
 	printf("pmap_link_l2pt: pa=0x%x va=0x%x\n", l2pv->pv_pa, l2pv->pv_va);
 #endif
 
 	*entry = proto | l2pv->pv_pa;
 	PTE_SYNC(entry);
 
 	SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list);
 }
 
 /*
  * pmap_map_entry
  *
  * 	Write one small-page L2 descriptor mapping "va" to "pa".  Both
  * 	addresses are asserted to be page aligned.  The L1 slot for "va"
  * 	must already reference an L2 table that kernel_pt_lookup() can
  * 	find; otherwise the function panics.
  */
 void
 pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot,
     int cache)
 {
 	pd_entry_t *l1 = (pd_entry_t *)l1pt;
 	pt_entry_t *l2, *slot;
 	pt_entry_t cachebits;
 
 	KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin"));
 
 	/* Anything other than PTE_CACHE / PTE_PAGETABLE adds no cache bits. */
 	if (cache == PTE_CACHE)
 		cachebits = pte_l2_s_cache_mode;
 	else if (cache == PTE_PAGETABLE)
 		cachebits = pte_l2_s_cache_mode_pt;
 	else
 		cachebits = 0;
 
 	if ((l1[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
 		panic("pmap_map_entry: no L2 table for VA 0x%08x", va);
 
 	l2 = (pt_entry_t *)kernel_pt_lookup(l1[L1_IDX(va)] & L1_C_ADDR_MASK);
 	if (l2 == NULL)
 		panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va);
 
 	slot = &l2[l2pte_index(va)];
 	*slot = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | cachebits;
 	PTE_SYNC(slot);
 }
 
 /*
  * pmap_map_chunk:
  *
  *	Map a chunk of memory using the most efficient mappings
  *	possible (section, large page, small page) into the
  *	provided L1 and L2 tables at the specified virtual address.
  *
  *	"size" is rounded up to a multiple of PAGE_SIZE and the rounded
  *	value is returned.  Panics if "l1pt" is 0, or if an L2 mapping
  *	is needed and no L2 table is linked for the VA.
  */
 vm_size_t
 pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
     vm_size_t size, int prot, int cache)
 {
 	pd_entry_t *pde = (pd_entry_t *) l1pt;
 	pt_entry_t *pte, f1, f2s, f2l;
 	vm_size_t resid;
 	int i;
 
 	resid = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
 
 	if (l1pt == 0)
 		panic("pmap_map_chunk: no L1 table provided");
 
 #ifdef VERBOSE_INIT_ARM
 	printf("pmap_map_chunk: pa=0x%x va=0x%x size=0x%x resid=0x%x "
 	    "prot=0x%x cache=%d\n", pa, va, size, resid, prot, cache);
 #endif
 
 	/* Pick the cache-mode bits for each of the three mapping sizes. */
 	switch (cache) {
 	case PTE_NOCACHE:
 	default:
 		f1 = 0;
 		f2l = 0;
 		f2s = 0;
 		break;
 
 	case PTE_CACHE:
 		f1 = pte_l1_s_cache_mode;
 		f2l = pte_l2_l_cache_mode;
 		f2s = pte_l2_s_cache_mode;
 		break;
 
 	case PTE_PAGETABLE:
 		f1 = pte_l1_s_cache_mode_pt;
 		f2l = pte_l2_l_cache_mode_pt;
 		f2s = pte_l2_s_cache_mode_pt;
 		break;
 	}
 
 	/* Remember the rounded length; it is the return value. */
 	size = resid;
 
 	while (resid > 0) {
 		/* See if we can use a section mapping. */
 		if (L1_S_MAPPABLE_P(va, pa, resid)) {
 #ifdef VERBOSE_INIT_ARM
 			printf("S");
 #endif
 			pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa |
 			    L1_S_PROT(PTE_KERNEL, prot) | f1 |
 			    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 			PTE_SYNC(&pde[va >> L1_S_SHIFT]);
 			va += L1_S_SIZE;
 			pa += L1_S_SIZE;
 			resid -= L1_S_SIZE;
 			continue;
 		}
 
 		/*
 		 * Ok, we're going to use an L2 table.  Make sure
 		 * one is actually in the corresponding L1 slot
 		 * for the current VA.
 		 */
 		if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
 			panic("pmap_map_chunk: no L2 table for VA 0x%08x", va);
 
 		pte = (pt_entry_t *) kernel_pt_lookup(
 		    pde[L1_IDX(va)] & L1_C_ADDR_MASK);
 		if (pte == NULL)
 			panic("pmap_map_chunk: can't find L2 table for VA "
 			    "0x%08x", va);
 		/* See if we can use a L2 large page mapping. */
 		if (L2_L_MAPPABLE_P(va, pa, resid)) {
 #ifdef VERBOSE_INIT_ARM
 			printf("L");
 #endif
 			/* The same descriptor is written to 16 consecutive slots. */
 			for (i = 0; i < 16; i++) {
 				pte[l2pte_index(va) + i] =
 				    L2_L_PROTO | pa |
 				    L2_L_PROT(PTE_KERNEL, prot) | f2l;
 				PTE_SYNC(&pte[l2pte_index(va) + i]);
 			}
 			va += L2_L_SIZE;
 			pa += L2_L_SIZE;
 			resid -= L2_L_SIZE;
 			continue;
 		}
 
 		/* Use a small page mapping. */
 #ifdef VERBOSE_INIT_ARM
 		printf("P");
 #endif
 		pte[l2pte_index(va)] =
 		    L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | f2s;
 		PTE_SYNC(&pte[l2pte_index(va)]);
 		va += PAGE_SIZE;
 		pa += PAGE_SIZE;
 		resid -= PAGE_SIZE;
 	}
 #ifdef VERBOSE_INIT_ARM
 	printf("\n");
 #endif
 	return (size);
 
 }
 
 /********************** Static device map routines ***************************/
 
 static const struct pmap_devmap *pmap_devmap_table;
 
 /*
  * Register the devmap table.  This is provided in case early console
  * initialization needs to register mappings created by bootstrap code
  * before pmap_devmap_bootstrap() is called.
  *
  * Only the pointer is recorded in pmap_devmap_table; no mappings are
  * created here and the table is not copied.
  */
 void
 pmap_devmap_register(const struct pmap_devmap *table)
 {
 
 	pmap_devmap_table = table;
 }
 
 /*
  * Record "table" as the devmap table and map each of its entries into
  * the L1 table at "l1pt" with pmap_map_chunk().  The table is walked
  * until an entry with pd_size == 0 is reached.
  */
 void
 pmap_devmap_bootstrap(vm_offset_t l1pt, const struct pmap_devmap *table)
 {
 	const struct pmap_devmap *pd;
 
 	pmap_devmap_table = table;
 
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 #ifdef VERBOSE_INIT_ARM
 		printf("devmap: %08x -> %08x @ %08x\n",
 		    pd->pd_pa, pd->pd_pa + pd->pd_size - 1, pd->pd_va);
 #endif
 		pmap_map_chunk(l1pt, pd->pd_va, pd->pd_pa, pd->pd_size,
 		    pd->pd_prot, pd->pd_cache);
 	}
 }
 
 /*
  * Find the devmap entry whose physical range fully contains
  * [pa, pa + size).  Returns NULL when no table has been registered or
  * no entry contains the range.
  */
 const struct pmap_devmap *
 pmap_devmap_find_pa(vm_paddr_t pa, vm_size_t size)
 {
 	const struct pmap_devmap *pd;
 
 	if (pmap_devmap_table == NULL)
 		return (NULL);
 
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 		/*
 		 * Compare the offset into the entry against the room left
 		 * in it instead of comparing end addresses: "base + size"
 		 * wraps to a small value for a region that ends at the top
 		 * of the address space, which would make the lookup fail.
 		 */
 		if (pa >= pd->pd_pa && size <= pd->pd_size &&
 		    pa - pd->pd_pa <= pd->pd_size - size)
 			return (pd);
 	}
 
 	return (NULL);
 }
 
 /*
  * Find the devmap entry whose virtual range fully contains
  * [va, va + size).  Returns NULL when no table has been registered or
  * no entry contains the range.
  */
 const struct pmap_devmap *
 pmap_devmap_find_va(vm_offset_t va, vm_size_t size)
 {
 	const struct pmap_devmap *pd;
 
 	if (pmap_devmap_table == NULL)
 		return (NULL);
 
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 		/*
 		 * Compare the offset into the entry against the room left
 		 * in it instead of comparing end addresses: "base + size"
 		 * wraps to a small value for a region that ends at the top
 		 * of the address space, which would make the lookup fail.
 		 */
 		if (va >= pd->pd_va && size <= pd->pd_size &&
 		    va - pd->pd_va <= pd->pd_size - size)
 			return (pd);
 	}
 
 	return (NULL);
 }
 
 /*
  * Record the memory attribute "ma" for page "m".  Panics if the page
  * already has a KVA mapping or any entry on its pv_list.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 	/*
 	 * The stored value is consulted when PTEs are built for mappings
 	 * established later.
 	 */
 	m->md.pv_memattr = ma;
 
 	/*
 	 * On ARM this appears to be called only before the page has any
 	 * mappings.  Should that change, the pv_list would have to be
 	 * walked here to make every existing mapping uncacheable, syncing
 	 * caches and PTEs (and possibly invalidating the TLB) for each
 	 * mapping that is modified.
 	 */
 	if (m->md.pv_kva == 0 && TAILQ_FIRST(&m->md.pv_list) == NULL)
 		return;
 	panic("Can't change memattr on page with existing mappings");
 }
 
 
Index: user/attilio/vmc-playground/sys/powerpc/booke/pmap.c
===================================================================
--- user/attilio/vmc-playground/sys/powerpc/booke/pmap.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/powerpc/booke/pmap.c	(revision 247224)
@@ -1,3177 +1,3176 @@
 /*-
  * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski <raj@semihalf.com>
  * Copyright (C) 2006 Semihalf, Marian Balakowicz <m8@semihalf.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
  * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Some hw specific parts of this pmap were derived or influenced
  * by NetBSD's ibm4xx pmap module. More generic code is shared with
  * a few other pmap modules from the FreeBSD tree.
  */
 
  /*
   * VM layout notes:
   *
   * Kernel and user threads run within one common virtual address space
   * defined by AS=0.
   *
   * Virtual address space layout:
   * -----------------------------
   * 0x0000_0000 - 0xafff_ffff	: user process
   * 0xb000_0000 - 0xbfff_ffff	: pmap_mapdev()-ed area (PCI/PCIE etc.)
   * 0xc000_0000 - 0xc0ff_ffff	: kernel reserved
   *   0xc000_0000 - data_end	: kernel code+data, env, metadata etc.
   * 0xc100_0000 - 0xfeef_ffff	: KVA
   *   0xc100_0000 - 0xc100_3fff : reserved for page zero/copy
   *   0xc100_4000 - 0xc200_3fff : reserved for ptbl bufs
   *   0xc200_4000 - 0xc200_8fff : guard page + kstack0
   *   0xc200_9000 - 0xfeef_ffff	: actual free KVA space
   * 0xfef0_0000 - 0xffff_ffff	: I/O devices region
   */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/ktr.h>
 #include <sys/proc.h>
 #include <sys/user.h>
 #include <sys/queue.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/msgbuf.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_param.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pager.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/platform.h>
 
 #include <machine/tlb.h>
 #include <machine/spr.h>
 #include <machine/md_var.h>
 #include <machine/mmuvar.h>
 #include <machine/pmap.h>
 #include <machine/pte.h>
 
 #include "mmu_if.h"
 
 #ifdef  DEBUG
 #define debugf(fmt, args...) printf(fmt, ##args)
 #else
 #define debugf(fmt, args...)
 #endif
 
 #define TODO			panic("%s: not implemented", __func__);
 
 extern struct mtx sched_lock;
 
 extern int dumpsys_minidump;
 
 extern unsigned char _etext[];
 extern unsigned char _end[];
 
 extern uint32_t *bootinfo;
 
 #ifdef SMP
 extern uint32_t bp_ntlb1s;
 #endif
 
 vm_paddr_t ccsrbar_pa;
 vm_paddr_t kernload;
 vm_offset_t kernstart;
 vm_size_t kernsize;
 
 /* Message buffer and tables. */
 static vm_offset_t data_start;
 static vm_size_t data_end;
 
 /* Phys/avail memory regions. */
 static struct mem_region *availmem_regions;
 static int availmem_regions_sz;
 static struct mem_region *physmem_regions;
 static int physmem_regions_sz;
 
 /* Reserved KVA space and mutex for mmu_booke_zero_page. */
 static vm_offset_t zero_page_va;
 static struct mtx zero_page_mutex;
 
 static struct mtx tlbivax_mutex;
 
 /*
  * Reserved KVA space for mmu_booke_zero_page_idle. This is used
  * by idle thread only, no lock required.
  */
 static vm_offset_t zero_page_idle_va;
 
 /* Reserved KVA space and mutex for mmu_booke_copy_page. */
 static vm_offset_t copy_page_src_va;
 static vm_offset_t copy_page_dst_va;
 static struct mtx copy_page_mutex;
 
 /**************************************************************************/
 /* PMAP */
 /**************************************************************************/
 
 static void mmu_booke_enter_locked(mmu_t, pmap_t, vm_offset_t, vm_page_t,
     vm_prot_t, boolean_t);
 
 unsigned int kptbl_min;		/* Index of the first kernel ptbl. */
 unsigned int kernel_ptbls;	/* Number of KVA ptbls. */
 
 /*
  * If user pmap is processed with mmu_booke_remove and the resident count
  * drops to 0, there are no more pages to remove, so we need not continue.
  */
 #define PMAP_REMOVE_DONE(pmap) \
 	((pmap) != kernel_pmap && (pmap)->pm_stats.resident_count == 0)
 
 extern void tid_flush(tlbtid_t);
 
 /**************************************************************************/
 /* TLB and TID handling */
 /**************************************************************************/
 
 /* Translation ID busy table */
 static volatile pmap_t tidbusy[MAXCPU][TID_MAX + 1];
 
 /*
  * TLB0 capabilities (entry, way numbers etc.). These can vary between e500
  * core revisions and should be read from h/w registers during early config.
  */
 uint32_t tlb0_entries;
 uint32_t tlb0_ways;
 uint32_t tlb0_entries_per_way;
 
 #define TLB0_ENTRIES		(tlb0_entries)
 #define TLB0_WAYS		(tlb0_ways)
 #define TLB0_ENTRIES_PER_WAY	(tlb0_entries_per_way)
 
 #define TLB1_ENTRIES 16
 
 /* In-ram copy of the TLB1 */
 static tlb_entry_t tlb1[TLB1_ENTRIES];
 
 /* Next free entry in the TLB1 */
 static unsigned int tlb1_idx;
 
 static tlbtid_t tid_alloc(struct pmap *);
 
 static void tlb_print_entry(int, uint32_t, uint32_t, uint32_t, uint32_t);
 
 static int tlb1_set_entry(vm_offset_t, vm_offset_t, vm_size_t, uint32_t);
 static void tlb1_write_entry(unsigned int);
 static int tlb1_iomapped(int, vm_paddr_t, vm_size_t, vm_offset_t *);
 static vm_size_t tlb1_mapin_region(vm_offset_t, vm_paddr_t, vm_size_t);
 
 static vm_size_t tsize2size(unsigned int);
 static unsigned int size2tsize(vm_size_t);
 static unsigned int ilog2(unsigned int);
 
 static void set_mas4_defaults(void);
 
 static inline void tlb0_flush_entry(vm_offset_t);
 static inline unsigned int tlb0_tableidx(vm_offset_t, unsigned int);
 
 /**************************************************************************/
 /* Page table management */
 /**************************************************************************/
 
 static struct rwlock_padalign pvh_global_lock;
 
 /* Data for the pv entry allocation mechanism */
 static uma_zone_t pvzone;
-static struct vm_object pvzone_obj;
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 
 #define PV_ENTRY_ZONE_MIN	2048	/* min pv entries in uma zone */
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC	200
 #endif
 
 static void ptbl_init(void);
 static struct ptbl_buf *ptbl_buf_alloc(void);
 static void ptbl_buf_free(struct ptbl_buf *);
 static void ptbl_free_pmap_ptbl(pmap_t, pte_t *);
 
 static pte_t *ptbl_alloc(mmu_t, pmap_t, unsigned int);
 static void ptbl_free(mmu_t, pmap_t, unsigned int);
 static void ptbl_hold(mmu_t, pmap_t, unsigned int);
 static int ptbl_unhold(mmu_t, pmap_t, unsigned int);
 
 static vm_paddr_t pte_vatopa(mmu_t, pmap_t, vm_offset_t);
 static pte_t *pte_find(mmu_t, pmap_t, vm_offset_t);
 static void pte_enter(mmu_t, pmap_t, vm_page_t, vm_offset_t, uint32_t);
 static int pte_remove(mmu_t, pmap_t, vm_offset_t, uint8_t);
 
 static pv_entry_t pv_alloc(void);
 static void pv_free(pv_entry_t);
 static void pv_insert(pmap_t, vm_offset_t, vm_page_t);
 static void pv_remove(pmap_t, vm_offset_t, vm_page_t);
 
 /* Number of kva ptbl buffers, each covering one ptbl (PTBL_PAGES). */
 #define PTBL_BUFS		(128 * 16)
 
 /*
  * Descriptor for one kva buffer used to map a page table.  A descriptor
  * is linked either on ptbl_buf_freelist or on a pmap's pm_ptbl_list.
  */
 struct ptbl_buf {
 	TAILQ_ENTRY(ptbl_buf) link;	/* list link */
 	vm_offset_t kva;		/* va of mapping */
 };
 
 /* ptbl free list and a lock used for access synchronization. */
 static TAILQ_HEAD(, ptbl_buf) ptbl_buf_freelist;
 static struct mtx ptbl_buf_freelist_lock;
 
 /* Base address of kva space allocated for ptbl bufs. */
 static vm_offset_t ptbl_buf_pool_vabase;
 
 /* Pointer to ptbl_buf structures. */
 static struct ptbl_buf *ptbl_bufs;
 
 void pmap_bootstrap_ap(volatile uint32_t *);
 
 /*
  * Kernel MMU interface
  */
 static void		mmu_booke_change_wiring(mmu_t, pmap_t, vm_offset_t, boolean_t);
 static void		mmu_booke_clear_modify(mmu_t, vm_page_t);
 static void		mmu_booke_clear_reference(mmu_t, vm_page_t);
 static void		mmu_booke_copy(mmu_t, pmap_t, pmap_t, vm_offset_t,
     vm_size_t, vm_offset_t);
 static void		mmu_booke_copy_page(mmu_t, vm_page_t, vm_page_t);
 static void		mmu_booke_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t,
     vm_prot_t, boolean_t);
 static void		mmu_booke_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t,
     vm_page_t, vm_prot_t);
 static void		mmu_booke_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t,
     vm_prot_t);
 static vm_paddr_t	mmu_booke_extract(mmu_t, pmap_t, vm_offset_t);
 static vm_page_t	mmu_booke_extract_and_hold(mmu_t, pmap_t, vm_offset_t,
     vm_prot_t);
 static void		mmu_booke_init(mmu_t);
 static boolean_t	mmu_booke_is_modified(mmu_t, vm_page_t);
 static boolean_t	mmu_booke_is_prefaultable(mmu_t, pmap_t, vm_offset_t);
 static boolean_t	mmu_booke_is_referenced(mmu_t, vm_page_t);
 static int		mmu_booke_ts_referenced(mmu_t, vm_page_t);
 static vm_offset_t	mmu_booke_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t,
     int);
 static int		mmu_booke_mincore(mmu_t, pmap_t, vm_offset_t,
     vm_paddr_t *);
 static void		mmu_booke_object_init_pt(mmu_t, pmap_t, vm_offset_t,
     vm_object_t, vm_pindex_t, vm_size_t);
 static boolean_t	mmu_booke_page_exists_quick(mmu_t, pmap_t, vm_page_t);
 static void		mmu_booke_page_init(mmu_t, vm_page_t);
 static int		mmu_booke_page_wired_mappings(mmu_t, vm_page_t);
 static void		mmu_booke_pinit(mmu_t, pmap_t);
 static void		mmu_booke_pinit0(mmu_t, pmap_t);
 static void		mmu_booke_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t,
     vm_prot_t);
 static void		mmu_booke_qenter(mmu_t, vm_offset_t, vm_page_t *, int);
 static void		mmu_booke_qremove(mmu_t, vm_offset_t, int);
 static void		mmu_booke_release(mmu_t, pmap_t);
 static void		mmu_booke_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
 static void		mmu_booke_remove_all(mmu_t, vm_page_t);
 static void		mmu_booke_remove_write(mmu_t, vm_page_t);
 static void		mmu_booke_zero_page(mmu_t, vm_page_t);
 static void		mmu_booke_zero_page_area(mmu_t, vm_page_t, int, int);
 static void		mmu_booke_zero_page_idle(mmu_t, vm_page_t);
 static void		mmu_booke_activate(mmu_t, struct thread *);
 static void		mmu_booke_deactivate(mmu_t, struct thread *);
 static void		mmu_booke_bootstrap(mmu_t, vm_offset_t, vm_offset_t);
 static void		*mmu_booke_mapdev(mmu_t, vm_paddr_t, vm_size_t);
 static void		mmu_booke_unmapdev(mmu_t, vm_offset_t, vm_size_t);
 static vm_paddr_t	mmu_booke_kextract(mmu_t, vm_offset_t);
 static void		mmu_booke_kenter(mmu_t, vm_offset_t, vm_paddr_t);
 static void		mmu_booke_kremove(mmu_t, vm_offset_t);
 static boolean_t	mmu_booke_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t);
 static void		mmu_booke_sync_icache(mmu_t, pmap_t, vm_offset_t,
     vm_size_t);
 static vm_offset_t	mmu_booke_dumpsys_map(mmu_t, struct pmap_md *,
     vm_size_t, vm_size_t *);
 static void		mmu_booke_dumpsys_unmap(mmu_t, struct pmap_md *,
     vm_size_t, vm_offset_t);
 static struct pmap_md	*mmu_booke_scan_md(mmu_t, struct pmap_md *);
 
 static mmu_method_t mmu_booke_methods[] = {
 	/* pmap dispatcher interface */
 	MMUMETHOD(mmu_change_wiring,	mmu_booke_change_wiring),
 	MMUMETHOD(mmu_clear_modify,	mmu_booke_clear_modify),
 	MMUMETHOD(mmu_clear_reference,	mmu_booke_clear_reference),
 	MMUMETHOD(mmu_copy,		mmu_booke_copy),
 	MMUMETHOD(mmu_copy_page,	mmu_booke_copy_page),
 	MMUMETHOD(mmu_enter,		mmu_booke_enter),
 	MMUMETHOD(mmu_enter_object,	mmu_booke_enter_object),
 	MMUMETHOD(mmu_enter_quick,	mmu_booke_enter_quick),
 	MMUMETHOD(mmu_extract,		mmu_booke_extract),
 	MMUMETHOD(mmu_extract_and_hold,	mmu_booke_extract_and_hold),
 	MMUMETHOD(mmu_init,		mmu_booke_init),
 	MMUMETHOD(mmu_is_modified,	mmu_booke_is_modified),
 	MMUMETHOD(mmu_is_prefaultable,	mmu_booke_is_prefaultable),
 	MMUMETHOD(mmu_is_referenced,	mmu_booke_is_referenced),
 	MMUMETHOD(mmu_ts_referenced,	mmu_booke_ts_referenced),
 	MMUMETHOD(mmu_map,		mmu_booke_map),
 	MMUMETHOD(mmu_mincore,		mmu_booke_mincore),
 	MMUMETHOD(mmu_object_init_pt,	mmu_booke_object_init_pt),
 	MMUMETHOD(mmu_page_exists_quick,mmu_booke_page_exists_quick),
 	MMUMETHOD(mmu_page_init,	mmu_booke_page_init),
 	MMUMETHOD(mmu_page_wired_mappings, mmu_booke_page_wired_mappings),
 	MMUMETHOD(mmu_pinit,		mmu_booke_pinit),
 	MMUMETHOD(mmu_pinit0,		mmu_booke_pinit0),
 	MMUMETHOD(mmu_protect,		mmu_booke_protect),
 	MMUMETHOD(mmu_qenter,		mmu_booke_qenter),
 	MMUMETHOD(mmu_qremove,		mmu_booke_qremove),
 	MMUMETHOD(mmu_release,		mmu_booke_release),
 	MMUMETHOD(mmu_remove,		mmu_booke_remove),
 	MMUMETHOD(mmu_remove_all,	mmu_booke_remove_all),
 	MMUMETHOD(mmu_remove_write,	mmu_booke_remove_write),
 	MMUMETHOD(mmu_sync_icache,	mmu_booke_sync_icache),
 	MMUMETHOD(mmu_zero_page,	mmu_booke_zero_page),
 	MMUMETHOD(mmu_zero_page_area,	mmu_booke_zero_page_area),
 	MMUMETHOD(mmu_zero_page_idle,	mmu_booke_zero_page_idle),
 	MMUMETHOD(mmu_activate,		mmu_booke_activate),
 	MMUMETHOD(mmu_deactivate,	mmu_booke_deactivate),
 
 	/* Internal interfaces */
 	MMUMETHOD(mmu_bootstrap,	mmu_booke_bootstrap),
 	MMUMETHOD(mmu_dev_direct_mapped,mmu_booke_dev_direct_mapped),
 	MMUMETHOD(mmu_mapdev,		mmu_booke_mapdev),
 	MMUMETHOD(mmu_kenter,		mmu_booke_kenter),
 	MMUMETHOD(mmu_kextract,		mmu_booke_kextract),
 /*	MMUMETHOD(mmu_kremove,		mmu_booke_kremove),	*/
 	MMUMETHOD(mmu_unmapdev,		mmu_booke_unmapdev),
 
 	/* dumpsys() support */
 	MMUMETHOD(mmu_dumpsys_map,	mmu_booke_dumpsys_map),
 	MMUMETHOD(mmu_dumpsys_unmap,	mmu_booke_dumpsys_unmap),
 	MMUMETHOD(mmu_scan_md,		mmu_booke_scan_md),
 
 	{ 0, 0 }
 };
 
 MMU_DEF(booke_mmu, MMU_TYPE_BOOKE, mmu_booke_methods, 0);
 
 /*
  * Take the per-CPU TLB lock of every CPU on cpuhead except the one whose
  * pcpu is pcpup.  Does nothing on non-SMP kernels or before smp_started
  * is set.
  */
 static inline void
 tlb_miss_lock(void)
 {
 #ifdef SMP
 	struct pcpu *pc;
 
 	if (!smp_started)
 		return;
 
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		/* Never take our own lock. */
 		if (pc == pcpup)
 			continue;
 
 		CTR3(KTR_PMAP, "%s: tlb miss LOCK of CPU=%d, "
 		    "tlb_lock=%p", __func__, pc->pc_cpuid, pc->pc_booke_tlb_lock);
 
 		KASSERT((pc->pc_cpuid != PCPU_GET(cpuid)),
 		    ("tlb_miss_lock: tried to lock self"));
 
 		tlb_lock(pc->pc_booke_tlb_lock);
 
 		CTR1(KTR_PMAP, "%s: locked", __func__);
 	}
 #endif
 }
 
 /*
  * Release the per-CPU TLB lock of every CPU on cpuhead except the one
  * whose pcpu is pcpup.  Does nothing on non-SMP kernels or before
  * smp_started is set.
  */
 static inline void
 tlb_miss_unlock(void)
 {
 #ifdef SMP
 	struct pcpu *pc;
 
 	if (!smp_started)
 		return;
 
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		/* Our own lock was never taken. */
 		if (pc == pcpup)
 			continue;
 
 		CTR2(KTR_PMAP, "%s: tlb miss UNLOCK of CPU=%d",
 		    __func__, pc->pc_cpuid);
 
 		tlb_unlock(pc->pc_booke_tlb_lock);
 
 		CTR1(KTR_PMAP, "%s: unlocked", __func__);
 	}
 #endif
 }
 
 /*
  * Read SPR_TLB0CFG and store the TLB0 entry count, associativity and
  * entries-per-way in tlb0_entries, tlb0_ways and tlb0_entries_per_way.
  */
 static __inline void
 tlb0_get_tlbconf(void)
 {
 	uint32_t cfg = mfspr(SPR_TLB0CFG);
 
 	tlb0_entries = cfg & TLBCFG_NENTRY_MASK;
 	tlb0_ways = (cfg & TLBCFG_ASSOC_MASK) >> TLBCFG_ASSOC_SHIFT;
 	tlb0_entries_per_way = tlb0_entries / tlb0_ways;
 }
 
 /* Initialize pool of kva ptbl buffers. */
 static void
 ptbl_init(void)
 {
 	int i;
 
 	CTR3(KTR_PMAP, "%s: s (ptbl_bufs = 0x%08x size 0x%08x)", __func__,
 	    (uint32_t)ptbl_bufs, sizeof(struct ptbl_buf) * PTBL_BUFS);
 	CTR3(KTR_PMAP, "%s: s (ptbl_buf_pool_vabase = 0x%08x size = 0x%08x)",
 	    __func__, ptbl_buf_pool_vabase, PTBL_BUFS * PTBL_PAGES * PAGE_SIZE);
 
 	mtx_init(&ptbl_buf_freelist_lock, "ptbl bufs lock", NULL, MTX_DEF);
 	TAILQ_INIT(&ptbl_buf_freelist);
 
 	for (i = 0; i < PTBL_BUFS; i++) {
 		ptbl_bufs[i].kva = ptbl_buf_pool_vabase + i * PTBL_PAGES * PAGE_SIZE;
 		TAILQ_INSERT_TAIL(&ptbl_buf_freelist, &ptbl_bufs[i], link);
 	}
 }
 
 /*
  * Take the first ptbl_buf off the freelist, under the freelist mutex.
  * Returns NULL when the freelist is empty.
  */
 static struct ptbl_buf *
 ptbl_buf_alloc(void)
 {
 	struct ptbl_buf *pbuf;
 
 	mtx_lock(&ptbl_buf_freelist_lock);
 	if ((pbuf = TAILQ_FIRST(&ptbl_buf_freelist)) != NULL)
 		TAILQ_REMOVE(&ptbl_buf_freelist, pbuf, link);
 	mtx_unlock(&ptbl_buf_freelist_lock);
 
 	CTR2(KTR_PMAP, "%s: buf = %p", __func__, pbuf);
 
 	return (pbuf);
 }
 
 /*
  * Return a ptbl buf to the free pool: append it to the tail of
  * ptbl_buf_freelist while holding the freelist mutex.
  */
 static void
 ptbl_buf_free(struct ptbl_buf *buf)
 {
 
 	CTR2(KTR_PMAP, "%s: buf = %p", __func__, buf);
 
 	mtx_lock(&ptbl_buf_freelist_lock);
 	TAILQ_INSERT_TAIL(&ptbl_buf_freelist, buf, link);
 	mtx_unlock(&ptbl_buf_freelist_lock);
 }
 
 /*
  * Find the ptbl buf on the pmap's list of allocated ptbl bufs whose kva
  * matches "ptbl", unlink it from that list and return it to the free
  * pool.  Nothing happens if no buf matches.  The pmap lock must be held.
  */
 static void
 ptbl_free_pmap_ptbl(pmap_t pmap, pte_t *ptbl)
 {
 	struct ptbl_buf *pbuf;
 
 	CTR2(KTR_PMAP, "%s: ptbl = %p", __func__, ptbl);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	TAILQ_FOREACH(pbuf, &pmap->pm_ptbl_list, link) {
 		if (pbuf->kva != (vm_offset_t)ptbl)
 			continue;
 
 		/* Unlink from the pmap, then hand back to the free pool. */
 		TAILQ_REMOVE(&pmap->pm_ptbl_list, pbuf, link);
 		ptbl_buf_free(pbuf);
 		return;
 	}
 }
 
 /*
  * Allocate page table.
  *
  * Takes a kva buffer from the ptbl buf pool, backs it with PTBL_PAGES
  * freshly allocated wired pages, maps and zeroes them, and records the
  * buffer on the pmap's pm_ptbl_list.  Returns the kva of the new table.
  * Panics if the ptbl buf pool is exhausted.
  *
  * The page allocation may sleep; while waiting, both the pmap lock and
  * pvh_global_lock are dropped and then retaken.
  */
 static pte_t *
 ptbl_alloc(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx)
 {
 	vm_page_t mtbl[PTBL_PAGES];
 	vm_page_t m;
 	struct ptbl_buf *pbuf;
 	unsigned int pidx;
 	pte_t *ptbl;
 	int i;
 
 	CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap,
 	    (pmap == kernel_pmap), pdir_idx);
 
 	KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)),
 	    ("ptbl_alloc: invalid pdir_idx"));
 	KASSERT((pmap->pm_pdir[pdir_idx] == NULL),
 	    ("ptbl_alloc: valid ptbl entry exists!"));
 
 	pbuf = ptbl_buf_alloc();
 	if (pbuf == NULL)
 		panic("ptbl_alloc: couldn't alloc kernel virtual memory");
 
 	ptbl = (pte_t *)pbuf->kva;
 
 	CTR2(KTR_PMAP, "%s: ptbl kva = %p", __func__, ptbl);
 
 	/* Allocate ptbl pages, this will sleep! */
 	for (i = 0; i < PTBL_PAGES; i++) {
 		pidx = (PTBL_PAGES * pdir_idx) + i;
 		while ((m = vm_page_alloc(NULL, pidx,
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 
 			/* Drop both locks while waiting for free pages. */
 			PMAP_UNLOCK(pmap);
 			rw_wunlock(&pvh_global_lock);
 			VM_WAIT;
 			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 		mtbl[i] = m;
 	}
 
 	/* Map allocated pages into kernel_pmap. */
 	mmu_booke_qenter(mmu, (vm_offset_t)ptbl, mtbl, PTBL_PAGES);
 
 	/* Zero whole ptbl. */
 	bzero((caddr_t)ptbl, PTBL_PAGES * PAGE_SIZE);
 
 	/* Add pbuf to the pmap ptbl bufs list. */
 	TAILQ_INSERT_TAIL(&pmap->pm_ptbl_list, pbuf, link);
 
 	return (ptbl);
 }
 
 /*
  * Free ptbl pages and invalidate pdir entry.
  *
  * The pdir slot is cleared first, then each of the PTBL_PAGES backing
  * pages is freed and unmapped from the kernel, and finally the kva
  * buffer is returned to the pool via ptbl_free_pmap_ptbl().
  */
 static void
 ptbl_free(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx)
 {
 	pte_t *ptbl;
 	vm_paddr_t pa;
 	vm_offset_t va;
 	vm_page_t m;
 	int i;
 
 	CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap,
 	    (pmap == kernel_pmap), pdir_idx);
 
 	KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)),
 	    ("ptbl_free: invalid pdir_idx"));
 
 	ptbl = pmap->pm_pdir[pdir_idx];
 
 	CTR2(KTR_PMAP, "%s: ptbl = %p", __func__, ptbl);
 
 	KASSERT((ptbl != NULL), ("ptbl_free: null ptbl"));
 
 	/*
 	 * Invalidate the pdir entry as soon as possible, so that other CPUs
 	 * don't attempt to look up the page tables we are releasing.
 	 */
 	mtx_lock_spin(&tlbivax_mutex);
 	tlb_miss_lock();
 	
 	pmap->pm_pdir[pdir_idx] = NULL;
 
 	tlb_miss_unlock();
 	mtx_unlock_spin(&tlbivax_mutex);
 
 	for (i = 0; i < PTBL_PAGES; i++) {
 		va = ((vm_offset_t)ptbl + (i * PAGE_SIZE));
 		/* The kernel mapping is still needed here to find the page. */
 		pa = pte_vatopa(mmu, kernel_pmap, va);
 		m = PHYS_TO_VM_PAGE(pa);
 		vm_page_free_zero(m);
 		/* One wired page fewer in the global count. */
 		atomic_subtract_int(&cnt.v_wire_count, 1);
 		mmu_booke_kremove(mmu, va);
 	}
 
 	ptbl_free_pmap_ptbl(pmap, ptbl);
 }
 
 /*
  * Drop one hold (wire_count) from every page backing the ptbl at
  * "pdir_idx" and free the ptbl once the count reaches zero.
  * Called when removing a pte entry from the ptbl.
  *
  * Returns 1 if the ptbl pages were freed, 0 otherwise.
  */
 static int
 ptbl_unhold(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx)
 {
 	pte_t *ptbl;
 	vm_offset_t va;
 	vm_paddr_t pa;
 	vm_page_t m;
 	int i;
 
 	CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap,
 	    (pmap == kernel_pmap), pdir_idx);
 
 	KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)),
 	    ("ptbl_unhold: invalid pdir_idx"));
 	KASSERT((pmap != kernel_pmap),
 	    ("ptbl_unhold: unholding kernel ptbl!"));
 
 	ptbl = pmap->pm_pdir[pdir_idx];
 
 	KASSERT(((vm_offset_t)ptbl >= VM_MIN_KERNEL_ADDRESS),
 	    ("ptbl_unhold: non kva ptbl"));
 
 	/* Decrement the hold count of each backing page. */
 	va = (vm_offset_t)ptbl;
 	for (i = 0; i < PTBL_PAGES; i++, va += PAGE_SIZE) {
 		pa = pte_vatopa(mmu, kernel_pmap, va);
 		m = PHYS_TO_VM_PAGE(pa);
 		m->wire_count--;
 	}
 
 	/*
 	 * All ptbl pages carry the same wire_count, so inspecting the last
 	 * page visited is enough to tell whether any pte entries remain.
 	 */
 	if (m->wire_count != 0)
 		return (0);
 
 	ptbl_free(mmu, pmap, pdir_idx);
 	return (1);
 }
 
 /*
  * Add one hold (wire_count) to every page backing the ptbl at
  * "pdir_idx".  Used when a new pte entry is being inserted into the ptbl.
  */
 static void
 ptbl_hold(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx)
 {
 	vm_offset_t va;
 	vm_paddr_t pa;
 	pte_t *ptbl;
 	vm_page_t m;
 	int i;
 
 	CTR3(KTR_PMAP, "%s: pmap = %p pdir_idx = %d", __func__, pmap,
 	    pdir_idx);
 
 	KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)),
 	    ("ptbl_hold: invalid pdir_idx"));
 	KASSERT((pmap != kernel_pmap),
 	    ("ptbl_hold: holding kernel ptbl!"));
 
 	ptbl = pmap->pm_pdir[pdir_idx];
 
 	KASSERT((ptbl != NULL), ("ptbl_hold: null ptbl"));
 
 	va = (vm_offset_t)ptbl;
 	for (i = 0; i < PTBL_PAGES; i++, va += PAGE_SIZE) {
 		pa = pte_vatopa(mmu, kernel_pmap, va);
 		m = PHYS_TO_VM_PAGE(pa);
 		m->wire_count++;
 	}
 }
 
 /*
  * Allocate a pv_entry structure from pvzone without sleeping.
  *
  * The global entry counter is bumped first; once it exceeds the high water
  * mark the page daemon is woken up.  Returns NULL if the zone allocation
  * fails.
  */
 pv_entry_t
 pv_alloc(void)
 {
 
 	if (++pv_entry_count > pv_entry_high_water)
 		pagedaemon_wakeup();
 
 	return (uma_zalloc(pvzone, M_NOWAIT));
 }
 
 /* Return a pv_entry structure to pvzone and account for its release. */
 static __inline void
 pv_free(pv_entry_t pve)
 {
 
 	--pv_entry_count;
 	uma_zfree(pvzone, pve);
 }
 
 
 /*
  * Allocate a pv_entry describing the (pmap, va) mapping and append it to the
  * pv_list of page m.  Panics when no pv_entry can be allocated.
  */
 static void
 pv_insert(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	if ((entry = pv_alloc()) == NULL)
 		panic("pv_insert: no pv entries!");
 
 	entry->pv_pmap = pmap;
 	entry->pv_va = va;
 
 	/*
 	 * Both the pmap lock and the global pv lock (write mode) are asserted
 	 * before the page's pv_list is touched.
 	 */
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_link);
 }
 
 /*
  * Find the pv_entry for the (pmap, va) mapping on page m's pv_list, unlink
  * it and free it.  When the list becomes empty the page's PGA_WRITEABLE
  * flag is cleared.  Nothing happens if no matching entry exists.
  */
 static void
 pv_remove(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	TAILQ_FOREACH(entry, &m->md.pv_list, pv_link) {
 		/* Skip entries that describe some other mapping. */
 		if ((pmap != entry->pv_pmap) || (va != entry->pv_va))
 			continue;
 
 		/* Unlink the match; the walk stops here. */
 		TAILQ_REMOVE(&m->md.pv_list, entry, pv_link);
 		if (TAILQ_EMPTY(&m->md.pv_list))
 			vm_page_aflag_clear(m, PGA_WRITEABLE);
 
 		pv_free(entry);
 		break;
 	}
 }
 
 /*
  * Clean pte entry, try to free page table page if requested.
  *
  * The page table for va is looked up in pmap->pm_pdir and must exist.  If
  * the pte for va is not valid nothing is done.  Otherwise:
  *  - the pmap's wired_count is decremented for a wired pte;
  *  - for a managed pte the modified/referenced state is propagated to the
  *    vm_page and the matching pv entry is destroyed via pv_remove(), which
  *    asserts that the pmap lock and pvh_global_lock (write) are held;
  *  - the TLB0 entry is flushed and the pte is zeroed while holding
  *    tlbivax_mutex and the TLB miss lock;
  *  - the pmap's resident_count is decremented.
  *
  * flags: when PTBL_UNHOLD is set, the ptbl hold count is dropped through
  * ptbl_unhold() once the pte has been cleared.
  *
  * Return 1 if ptbl pages were freed, otherwise return 0.
  */
 static int
 pte_remove(mmu_t mmu, pmap_t pmap, vm_offset_t va, uint8_t flags)
 {
 	unsigned int pdir_idx = PDIR_IDX(va);
 	unsigned int ptbl_idx = PTBL_IDX(va);
 	vm_page_t m;
 	pte_t *ptbl;
 	pte_t *pte;
 
 	//int su = (pmap == kernel_pmap);
 	//debugf("pte_remove: s (su = %d pmap = 0x%08x va = 0x%08x flags = %d)\n",
 	//		su, (u_int32_t)pmap, va, flags);
 
 	ptbl = pmap->pm_pdir[pdir_idx];
 	KASSERT(ptbl, ("pte_remove: null ptbl"));
 
 	pte = &ptbl[ptbl_idx];
 
 	/*
 	 * NOTE(review): pte is the address of an element of ptbl, so the NULL
 	 * test below does not look reachable when ptbl is non-NULL; the
 	 * validity test is the one that matters.
 	 */
 	if (pte == NULL || !PTE_ISVALID(pte))
 		return (0);
 
 	if (PTE_ISWIRED(pte))
 		pmap->pm_stats.wired_count--;
 
 	/* Handle managed entry. */
 	if (PTE_ISMANAGED(pte)) {
 		/* Get vm_page_t for mapped pte. */
 		m = PHYS_TO_VM_PAGE(PTE_PA(pte));
 
 		/* Carry the modified/referenced bits over to the page. */
 		if (PTE_ISMODIFIED(pte))
 			vm_page_dirty(m);
 
 		if (PTE_ISREFERENCED(pte))
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		pv_remove(pmap, va, m);
 	}
 
 	/* Flush the translation and clear the pte under the TLB locks. */
 	mtx_lock_spin(&tlbivax_mutex);
 	tlb_miss_lock();
 
 	tlb0_flush_entry(va);
 	pte->flags = 0;
 	pte->rpn = 0;
 
 	tlb_miss_unlock();
 	mtx_unlock_spin(&tlbivax_mutex);
 
 	pmap->pm_stats.resident_count--;
 
 	if (flags & PTBL_UNHOLD) {
 		//debugf("pte_remove: e (unhold)\n");
 		return (ptbl_unhold(mmu, pmap, pdir_idx));
 	}
 
 	//debugf("pte_remove: e\n");
 	return (0);
 }
 
 /*
  * Insert PTE for a given page and virtual address.
  *
  * mmu/pmap: MMU object and target pmap.
  * m:        page to map; its physical address becomes the pte's rpn.
  * va:       virtual address to map.
  * flags:    pte flags to install; PTE_VALID is always added, and
  *           PTE_MANAGED is added when the page is not VPO_UNMANAGED.
  *
  * A missing page table is allocated with ptbl_alloc() and hooked into the
  * page directory later, under the TLB locks.  An existing valid mapping at
  * va is removed first; an unused pte in a user pmap gets an extra ptbl hold.
  * For managed pages a pv entry is created with pv_insert(), which panics if
  * none can be allocated and asserts the pmap lock and pvh_global_lock.
  */
 static void
 pte_enter(mmu_t mmu, pmap_t pmap, vm_page_t m, vm_offset_t va, uint32_t flags)
 {
 	unsigned int pdir_idx = PDIR_IDX(va);
 	unsigned int ptbl_idx = PTBL_IDX(va);
 	pte_t *ptbl, *pte;
 
 	CTR4(KTR_PMAP, "%s: su = %d pmap = %p va = %p", __func__,
 	    pmap == kernel_pmap, pmap, va);
 
 	/* Get the page table pointer. */
 	ptbl = pmap->pm_pdir[pdir_idx];
 
 	if (ptbl == NULL) {
 		/*
 		 * Allocate page table pages.
 		 * NOTE(review): no ptbl_hold() is issued on this path;
 		 * presumably ptbl_alloc() sets the initial hold count --
 		 * confirm against ptbl_alloc().
 		 */
 		ptbl = ptbl_alloc(mmu, pmap, pdir_idx);
 	} else {
 		/*
 		 * Check if there is valid mapping for requested
 		 * va, if there is, remove it.
 		 */
 		pte = &pmap->pm_pdir[pdir_idx][ptbl_idx];
 		if (PTE_ISVALID(pte)) {
 			/*
 			 * PTBL_HOLD (rather than PTBL_UNHOLD) is passed, so
 			 * the slot's existing hold is presumably kept for the
 			 * mapping installed below.
 			 */
 			pte_remove(mmu, pmap, va, PTBL_HOLD);
 		} else {
 			/*
 			 * pte is not used, increment hold count
 			 * for ptbl pages.
 			 */
 			if (pmap != kernel_pmap)
 				ptbl_hold(mmu, pmap, pdir_idx);
 		}
 	}
 
 	/*
 	 * Insert pv_entry into pv_list for mapped page if part of managed
 	 * memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		flags |= PTE_MANAGED;
 
 		/* Create and insert pv entry. */
 		pv_insert(pmap, va, m);
 	}
 
 	pmap->pm_stats.resident_count++;
 	
 	/* Publish the new translation while holding the TLB locks. */
 	mtx_lock_spin(&tlbivax_mutex);
 	tlb_miss_lock();
 
 	tlb0_flush_entry(va);
 	if (pmap->pm_pdir[pdir_idx] == NULL) {
 		/*
 		 * If we just allocated a new page table, hook it in
 		 * the pdir.
 		 */
 		pmap->pm_pdir[pdir_idx] = ptbl;
 	}
 	pte = &(pmap->pm_pdir[pdir_idx][ptbl_idx]);
 	pte->rpn = VM_PAGE_TO_PHYS(m) & ~PTE_PA_MASK;
 	/*
 	 * Flags are OR-ed in, not assigned: this relies on pte->flags being
 	 * zero here (pte_remove() zeroes it; a freshly allocated ptbl is
 	 * presumably zero-filled -- TODO confirm).
 	 */
 	pte->flags |= (PTE_VALID | flags);
 
 	tlb_miss_unlock();
 	mtx_unlock_spin(&tlbivax_mutex);
 }
 
 /*
  * Translate va in the given pmap to a physical address.
  *
  * Returns the page's physical address combined with the in-page offset of
  * va, or 0 when there is no valid pte for va.
  */
 static vm_paddr_t
 pte_vatopa(mmu_t mmu, pmap_t pmap, vm_offset_t va)
 {
 	pte_t *entry;
 
 	entry = pte_find(mmu, pmap, va);
 	if ((entry == NULL) || !PTE_ISVALID(entry))
 		return (0);
 
 	return (PTE_PA(entry) | (va & PTE_PA_MASK));
 }
 
 /*
  * Locate the pte slot for va in the given pmap.
  *
  * Returns a pointer to the slot (which may or may not hold a valid pte), or
  * NULL when no page table is installed for the directory entry covering va.
  */
 static pte_t *
 pte_find(mmu_t mmu, pmap_t pmap, vm_offset_t va)
 {
 	pte_t *ptbl;
 
 	KASSERT((pmap != NULL), ("pte_find: invalid pmap"));
 
 	ptbl = pmap->pm_pdir[PDIR_IDX(va)];
 	if (ptbl == NULL)
 		return (NULL);
 
 	return (&(ptbl[PTBL_IDX(va)]));
 }
 
 /**************************************************************************/
 /* PMAP related */
 /**************************************************************************/
 
 /*
  * This is called during booke_init, before the system is really initialized.
  *
  * start/kernelend: virtual start and end of the loaded kernel image.
  *
  * Carves the early kernel data (dynamic per-cpu area, message buffer,
  * ptbl_bufs array, kernel page tables) out of the space following the
  * kernel image, extends the TLB1 mapping to cover it, sets up the KVA
  * bounds, builds phys_avail[] from the available memory regions with the
  * kernel range removed, initializes kernel_pmap and maps thread0's kernel
  * stack.
  */
 static void
 mmu_booke_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t kernelend)
 {
 	vm_offset_t phys_kernelend;
 	struct mem_region *mp, *mp1;
 	int cnt, i, j;
 	u_int s, e, sz;
 	u_int phys_avail_count;
 	vm_size_t physsz, hwphyssz, kstack0_sz;
 	vm_offset_t kernel_pdir, kstack0, va;
 	vm_paddr_t kstack0_phys;
 	void *dpcpu;
 	pte_t *pte;
 
 	debugf("mmu_booke_bootstrap: entered\n");
 
 	/* Initialize invalidation mutex */
 	mtx_init(&tlbivax_mutex, "tlbivax", NULL, MTX_SPIN);
 
 	/* Read TLB0 size and associativity. */
 	tlb0_get_tlbconf();
 
 	/*
 	 * Align kernel start and end address (kernel image).
 	 * Note that kernel end does not necessarily relate to kernsize.
 	 * kernsize is the size of the kernel that is actually mapped.
 	 * Also note that "start - 1" is deliberate. With SMP, the
 	 * entry point is exactly a page from the actual load address.
 	 * As such, trunc_page() has no effect and we're off by a page.
 	 * Since we always have the ELF header between the load address
 	 * and the entry point, we can safely subtract 1 to compensate.
 	 */
 	kernstart = trunc_page(start - 1);
 	data_start = round_page(kernelend);
 	data_end = data_start;
 
 	/*
 	 * Addresses of preloaded modules (like file systems) use
 	 * physical addresses. Make sure we relocate those into
 	 * virtual addresses.
 	 */
 	preload_addr_relocate = kernstart - kernload;
 
 	/* Allocate the dynamic per-cpu area. */
 	dpcpu = (void *)data_end;
 	data_end += DPCPU_SIZE;
 
 	/* Allocate space for the message buffer. */
 	msgbufp = (struct msgbuf *)data_end;
 	data_end += msgbufsize;
 	debugf(" msgbufp at 0x%08x end = 0x%08x\n", (uint32_t)msgbufp,
 	    data_end);
 
 	data_end = round_page(data_end);
 
 	/* Allocate space for ptbl_bufs. */
 	ptbl_bufs = (struct ptbl_buf *)data_end;
 	data_end += sizeof(struct ptbl_buf) * PTBL_BUFS;
 	debugf(" ptbl_bufs at 0x%08x end = 0x%08x\n", (uint32_t)ptbl_bufs,
 	    data_end);
 
 	data_end = round_page(data_end);
 
 	/* Allocate PTE tables for kernel KVA. */
 	kernel_pdir = data_end;
 	kernel_ptbls = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS +
 	    PDIR_SIZE - 1) / PDIR_SIZE;
 	data_end += kernel_ptbls * PTBL_PAGES * PAGE_SIZE;
 	debugf(" kernel ptbls: %d\n", kernel_ptbls);
 	debugf(" kernel pdir at 0x%08x end = 0x%08x\n", kernel_pdir, data_end);
 
 	debugf(" data_end: 0x%08x\n", data_end);
 	if (data_end - kernstart > kernsize) {
 		kernsize += tlb1_mapin_region(kernstart + kernsize,
 		    kernload + kernsize, (data_end - kernstart) - kernsize);
 	}
 	data_end = kernstart + kernsize;
 	debugf(" updated data_end: 0x%08x\n", data_end);
 
 	/*
 	 * Clear the structures - note we can only do it safely after the
 	 * possible additional TLB1 translations are in place (above) so that
 	 * all range up to the currently calculated 'data_end' is covered.
 	 *
 	 * The ptbl_bufs array is cleared using the same element count
 	 * (PTBL_BUFS) that was used to reserve it above.
 	 */
 	dpcpu_init(dpcpu, 0);
 	memset((void *)ptbl_bufs, 0, sizeof(struct ptbl_buf) * PTBL_BUFS);
 	memset((void *)kernel_pdir, 0, kernel_ptbls * PTBL_PAGES * PAGE_SIZE);
 
 	/*******************************************************/
 	/* Set the start and end of kva. */
 	/*******************************************************/
 	virtual_avail = round_page(data_end);
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/* Allocate KVA space for page zero/copy operations. */
 	zero_page_va = virtual_avail;
 	virtual_avail += PAGE_SIZE;
 	zero_page_idle_va = virtual_avail;
 	virtual_avail += PAGE_SIZE;
 	copy_page_src_va = virtual_avail;
 	virtual_avail += PAGE_SIZE;
 	copy_page_dst_va = virtual_avail;
 	virtual_avail += PAGE_SIZE;
 	debugf("zero_page_va = 0x%08x\n", zero_page_va);
 	debugf("zero_page_idle_va = 0x%08x\n", zero_page_idle_va);
 	debugf("copy_page_src_va = 0x%08x\n", copy_page_src_va);
 	debugf("copy_page_dst_va = 0x%08x\n", copy_page_dst_va);
 
 	/* Initialize page zero/copy mutexes. */
 	mtx_init(&zero_page_mutex, "mmu_booke_zero_page", NULL, MTX_DEF);
 	mtx_init(&copy_page_mutex, "mmu_booke_copy_page", NULL, MTX_DEF);
 
 	/* Allocate KVA space for ptbl bufs. */
 	ptbl_buf_pool_vabase = virtual_avail;
 	virtual_avail += PTBL_BUFS * PTBL_PAGES * PAGE_SIZE;
 	debugf("ptbl_buf_pool_vabase = 0x%08x end = 0x%08x\n",
 	    ptbl_buf_pool_vabase, virtual_avail);
 
 	/* Calculate corresponding physical addresses for the kernel region. */
 	phys_kernelend = kernload + kernsize;
 	debugf("kernel image and allocated data:\n");
 	debugf(" kernload    = 0x%08x\n", kernload);
 	debugf(" kernstart   = 0x%08x\n", kernstart);
 	debugf(" kernsize    = 0x%08x\n", kernsize);
 
 	/*
 	 * NOTE(review): this capacity check runs before mem_regions() below
 	 * stores availmem_regions_sz, so it sees whatever value the variable
 	 * held beforehand -- confirm whether that is intended.
 	 */
 	if (sizeof(phys_avail) / sizeof(phys_avail[0]) < availmem_regions_sz)
 		panic("mmu_booke_bootstrap: phys_avail too small");
 
 	/*
 	 * Remove kernel physical address range from avail regions list. Page
 	 * align all regions.  Non-page aligned memory isn't very interesting
 	 * to us.  Also, sort the entries for ascending addresses.
 	 */
 
 	/* Retrieve phys/avail mem regions */
 	mem_regions(&physmem_regions, &physmem_regions_sz,
 	    &availmem_regions, &availmem_regions_sz);
 	sz = 0;
 	cnt = availmem_regions_sz;
 	debugf("processing avail regions:\n");
 	for (mp = availmem_regions; mp->mr_size; mp++) {
 		s = mp->mr_start;
 		e = mp->mr_start + mp->mr_size;
 		debugf(" %08x-%08x -> ", s, e);
 		/* Check whether this region holds all of the kernel. */
 		if (s < kernload && e > phys_kernelend) {
 			/* Split: the part past the kernel becomes a new region. */
 			availmem_regions[cnt].mr_start = phys_kernelend;
 			availmem_regions[cnt++].mr_size = e - phys_kernelend;
 			e = kernload;
 		}
 		/* Look whether this regions starts within the kernel. */
 		if (s >= kernload && s < phys_kernelend) {
 			if (e <= phys_kernelend)
 				goto empty;
 			s = phys_kernelend;
 		}
 		/* Now look whether this region ends within the kernel. */
 		if (e > kernload && e <= phys_kernelend) {
 			if (s >= kernload)
 				goto empty;
 			e = kernload;
 		}
 		/* Now page align the start and size of the region. */
 		s = round_page(s);
 		e = trunc_page(e);
 		if (e < s)
 			e = s;
 		sz = e - s;
 		debugf("%08x-%08x = %x\n", s, e, sz);
 
 		/* Check whether some memory is left here. */
 		if (sz == 0) {
 		empty:
 			/* Drop this entry and revisit the slot it occupied. */
 			memmove(mp, mp + 1,
 			    (cnt - (mp - availmem_regions)) * sizeof(*mp));
 			cnt--;
 			mp--;
 			continue;
 		}
 
 		/* Do an insertion sort. */
 		for (mp1 = availmem_regions; mp1 < mp; mp1++)
 			if (s < mp1->mr_start)
 				break;
 		if (mp1 < mp) {
 			memmove(mp1 + 1, mp1, (char *)mp - (char *)mp1);
 			mp1->mr_start = s;
 			mp1->mr_size = sz;
 		} else {
 			mp->mr_start = s;
 			mp->mr_size = sz;
 		}
 	}
 	availmem_regions_sz = cnt;
 
 	/*******************************************************/
 	/* Steal physical memory for kernel stack from the end */
 	/* of the first avail region                           */
 	/*******************************************************/
 	kstack0_sz = KSTACK_PAGES * PAGE_SIZE;
 	kstack0_phys = availmem_regions[0].mr_start +
 	    availmem_regions[0].mr_size;
 	kstack0_phys -= kstack0_sz;
 	availmem_regions[0].mr_size -= kstack0_sz;
 
 	/*******************************************************/
 	/* Fill in phys_avail table, based on availmem_regions */
 	/*******************************************************/
 	phys_avail_count = 0;
 	physsz = 0;
 	hwphyssz = 0;
 	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
 
 	debugf("fill in phys_avail:\n");
 	for (i = 0, j = 0; i < availmem_regions_sz; i++, j += 2) {
 
 		debugf(" region: 0x%08x - 0x%08x (0x%08x)\n",
 		    availmem_regions[i].mr_start,
 		    availmem_regions[i].mr_start +
 		        availmem_regions[i].mr_size,
 		    availmem_regions[i].mr_size);
 
 		/* Honor the hw.physmem limit: truncate the last region. */
 		if (hwphyssz != 0 &&
 		    (physsz + availmem_regions[i].mr_size) >= hwphyssz) {
 			debugf(" hw.physmem adjust\n");
 			if (physsz < hwphyssz) {
 				phys_avail[j] = availmem_regions[i].mr_start;
 				phys_avail[j + 1] =
 				    availmem_regions[i].mr_start +
 				    hwphyssz - physsz;
 				physsz = hwphyssz;
 				phys_avail_count++;
 			}
 			break;
 		}
 
 		phys_avail[j] = availmem_regions[i].mr_start;
 		phys_avail[j + 1] = availmem_regions[i].mr_start +
 		    availmem_regions[i].mr_size;
 		phys_avail_count++;
 		physsz += availmem_regions[i].mr_size;
 	}
 	physmem = btoc(physsz);
 
 	/* Calculate the last available physical address. */
 	for (i = 0; phys_avail[i + 2] != 0; i += 2)
 		;
 	Maxmem = powerpc_btop(phys_avail[i + 1]);
 
 	debugf("Maxmem = 0x%08lx\n", Maxmem);
 	debugf("phys_avail_count = %d\n", phys_avail_count);
 	debugf("physsz = 0x%08x physmem = %ld (0x%08lx)\n", physsz, physmem,
 	    physmem);
 
 	/*******************************************************/
 	/* Initialize (statically allocated) kernel pmap. */
 	/*******************************************************/
 	PMAP_LOCK_INIT(kernel_pmap);
 	kptbl_min = VM_MIN_KERNEL_ADDRESS / PDIR_SIZE;
 
 	debugf("kernel_pmap = 0x%08x\n", (uint32_t)kernel_pmap);
 	debugf("kptbl_min = %d, kernel_ptbls = %d\n", kptbl_min, kernel_ptbls);
 	debugf("kernel pdir range: 0x%08x - 0x%08x\n",
 	    kptbl_min * PDIR_SIZE, (kptbl_min + kernel_ptbls) * PDIR_SIZE - 1);
 
 	/* Initialize kernel pdir */
 	for (i = 0; i < kernel_ptbls; i++)
 		kernel_pmap->pm_pdir[kptbl_min + i] =
 		    (pte_t *)(kernel_pdir + (i * PAGE_SIZE * PTBL_PAGES));
 
 	for (i = 0; i < MAXCPU; i++) {
 		kernel_pmap->pm_tid[i] = TID_KERNEL;
 
 		/* Initialize each CPU's tidbusy entry 0 with kernel_pmap */
 		tidbusy[i][0] = kernel_pmap;
 	}
 
 	/*
 	 * Fill in PTEs covering kernel code and data. They are not required
 	 * for address translation, as this area is covered by static TLB1
 	 * entries, but for pte_vatopa() to work correctly with kernel area
 	 * addresses.
 	 */
 	for (va = kernstart; va < data_end; va += PAGE_SIZE) {
 		pte = &(kernel_pmap->pm_pdir[PDIR_IDX(va)][PTBL_IDX(va)]);
 		pte->rpn = kernload + (va - kernstart);
 		pte->flags = PTE_M | PTE_SR | PTE_SW | PTE_SX | PTE_WIRED |
 		    PTE_VALID;
 	}
 	/* Mark kernel_pmap active on all CPUs */
 	CPU_FILL(&kernel_pmap->pm_active);
 
 	/*
 	 * Initialize the global pv list lock.
 	 */
 	rw_init(&pvh_global_lock, "pmap pv global");
 
 	/*******************************************************/
 	/* Final setup */
 	/*******************************************************/
 
 	/* Enter kstack0 into kernel map, provide guard page */
 	kstack0 = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
 	thread0.td_kstack = kstack0;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 
 	debugf("kstack_sz = 0x%08x\n", kstack0_sz);
 	debugf("kstack0_phys at 0x%08x - 0x%08x\n",
 	    kstack0_phys, kstack0_phys + kstack0_sz);
 	debugf("kstack0 at 0x%08x - 0x%08x\n", kstack0, kstack0 + kstack0_sz);
 
 	virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE + kstack0_sz;
 	for (i = 0; i < KSTACK_PAGES; i++) {
 		mmu_booke_kenter(mmu, kstack0, kstack0_phys);
 		kstack0 += PAGE_SIZE;
 		kstack0_phys += PAGE_SIZE;
 	}
 
 	debugf("virtual_avail = %08x\n", virtual_avail);
 	debugf("virtual_end   = %08x\n", virtual_end);
 
 	debugf("mmu_booke_bootstrap: exit\n");
 }
 
 /*
  * Per-AP pmap bootstrap: replay the TLB1 entries recorded in the s/w tlb1[]
  * table and restore the MAS4 defaults.  trcp is unused.
  */
 void
 pmap_bootstrap_ap(volatile uint32_t *trcp __unused)
 {
 	int idx;
 
 	/*
 	 * Finish TLB1 configuration: the BSP already set up its TLB1 and we
 	 * have the snapshot of its contents in the s/w tlb1[] table, so use
 	 * these values directly to (re)program AP's TLB1 hardware.
 	 * Only entries marked valid are written.
 	 */
 	for (idx = bp_ntlb1s; idx < tlb1_idx; idx++) {
 		if ((tlb1[idx].mas1 & MAS1_VALID) != 0)
 			tlb1_write_entry(idx);
 	}
 
 	set_mas4_defaults();
 }
 
 /*
  * Look up, under the pmap lock, the physical address that va maps to in the
  * given pmap.  The result is whatever pte_vatopa() reports for (pmap, va).
  */
 static vm_paddr_t
 mmu_booke_extract(mmu_t mmu, pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t paddr;
 
 	PMAP_LOCK(pmap);
 	paddr = pte_vatopa(mmu, pmap, va);
 	PMAP_UNLOCK(pmap);
 	return (paddr);
 }
 
 /*
  * Look up the physical address that a kernel virtual address maps to.
  * No pmap lock is taken; the kernel pmap is queried directly.
  */
 static vm_paddr_t
 mmu_booke_kextract(mmu_t mmu, vm_offset_t va)
 {
 	vm_paddr_t paddr;
 
 	paddr = pte_vatopa(mmu, kernel_pmap, va);
 	return (paddr);
 }
 
 /*
  * Initialize the pmap module.
  * Called by vm_init, to initialize any structures that the pmap
  * system needs to map virtual memory.
  *
  * Creates the UMA zone backing pv entries, derives the pv entry limit and
  * its high water mark (both overridable through loader tunables), preloads
  * the zone and initializes ptbl allocation.
  */
 static void
 mmu_booke_init(mmu_t mmu)
 {
 	int shpgperproc = PMAP_SHPGPERPROC;
 
 	/*
 	 * Initialize the address space (zone) for the pv entries.  Set a
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 */
 	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 
 	/* Default limit: shpgperproc entries per process plus one per page. */
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 
 	/* The high water mark is nine tenths of the (possibly tuned) limit. */
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
-	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
+	uma_zone_reserve_kva(pvzone, pv_entry_max);
 
 	/* Pre-fill pvzone with initial number of pv entries. */
 	uma_prealloc(pvzone, PV_ENTRY_ZONE_MIN);
 
 	/* Initialize ptbl allocation. */
 	ptbl_init();
 }
 
 /*
  * Enter count pages from the array m into consecutive kernel virtual pages
  * starting at sva.  Meant for temporary mappings of wired pages that need no
  * modification/reference tracking; whatever was mapped in the range before
  * is overwritten.
  */
 static void
 mmu_booke_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *m, int count)
 {
 	vm_offset_t va;
 	int n;
 
 	for (n = 0, va = sva; n < count; n++, va += PAGE_SIZE)
 		mmu_booke_kenter(mmu, va, VM_PAGE_TO_PHYS(m[n]));
 }
 
 /*
  * Tear down count consecutive kernel page mappings starting at sva.  This is
  * the counterpart of mmu_booke_qenter for its temporary mappings.
  */
 static void
 mmu_booke_qremove(mmu_t mmu, vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 	int n;
 
 	for (n = 0, va = sva; n < count; n++, va += PAGE_SIZE)
 		mmu_booke_kremove(mmu, va);
 }
 
 /*
  * Install a wired kernel mapping of physical address pa at va.
  *
  * The pte in kernel_pmap is rewritten while holding tlbivax_mutex and the
  * TLB miss lock; if a valid entry is being replaced its TLB0 translation is
  * flushed first.  va must lie within the kernel address range.
  */
 static void
 mmu_booke_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa)
 {
 	const uint32_t kflags =
 	    PTE_M | PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID;
 	pte_t *entry;
 
 	KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) &&
 	    (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kenter: invalid va"));
 
 	entry = &(kernel_pmap->pm_pdir[PDIR_IDX(va)][PTBL_IDX(va)]);
 
 	mtx_lock_spin(&tlbivax_mutex);
 	tlb_miss_lock();
 
 	if (PTE_ISVALID(entry)) {
 		CTR1(KTR_PMAP, "%s: replacing entry!", __func__);
 
 		/* A stale translation may be cached in TLB0; drop it. */
 		tlb0_flush_entry(va);
 	}
 
 	entry->rpn = pa & ~PTE_PA_MASK;
 	entry->flags = kflags;
 
 	/* Flush the real memory from the instruction cache. */
 	if ((kflags & (PTE_I | PTE_G)) == 0)
 		__syncicache((void *)va, PAGE_SIZE);
 
 	tlb_miss_unlock();
 	mtx_unlock_spin(&tlbivax_mutex);
 }
 
 /*
  * Remove the kernel mapping at va, if there is one.
  *
  * A valid pte is cleared and its TLB0 translation flushed while holding
  * tlbivax_mutex and the TLB miss lock; an invalid pte is only traced.
  * va must lie within the kernel address range.
  */
 static void
 mmu_booke_kremove(mmu_t mmu, vm_offset_t va)
 {
 	pte_t *entry;
 
 	KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) &&
 	    (va <= VM_MAX_KERNEL_ADDRESS)),
 	    ("mmu_booke_kremove: invalid va"));
 
 	entry = &(kernel_pmap->pm_pdir[PDIR_IDX(va)][PTBL_IDX(va)]);
 
 	if (PTE_ISVALID(entry)) {
 		mtx_lock_spin(&tlbivax_mutex);
 		tlb_miss_lock();
 
 		/* Invalidate entry in TLB0, update PTE. */
 		tlb0_flush_entry(va);
 		entry->flags = 0;
 		entry->rpn = 0;
 
 		tlb_miss_unlock();
 		mtx_unlock_spin(&tlbivax_mutex);
 	} else {
 		CTR1(KTR_PMAP, "%s: invalid pte", __func__);
 	}
 }
 
 /*
  * Initialize pmap associated with process 0.
  *
  * Performs the regular mmu_booke_pinit() initialization and then records
  * the pmap as this CPU's current pmap.
  */
 static void
 mmu_booke_pinit0(mmu_t mmu, pmap_t pmap)
 {
 
 	mmu_booke_pinit(mmu, pmap);
 	PCPU_SET(curpmap, pmap);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * Sets up the pmap lock, marks the pmap as having no TID on any CPU, clears
  * its set of active CPUs, its statistics and its page directory, and
  * initializes the list of page table buffers.  Must not be called on
  * kernel_pmap.
  */
 static void
 mmu_booke_pinit(mmu_t mmu, pmap_t pmap)
 {
 	int i;
 
 	CTR4(KTR_PMAP, "%s: pmap = %p, proc %d '%s'", __func__, pmap,
 	    curthread->td_proc->p_pid, curthread->td_proc->p_comm);
 
 	KASSERT((pmap != kernel_pmap), ("pmap_pinit: initializing kernel_pmap"));
 
 	PMAP_LOCK_INIT(pmap);
 	for (i = 0; i < MAXCPU; i++)
 		pmap->pm_tid[i] = TID_NONE;
 	/*
 	 * Clear the active-CPU set of the pmap being initialized.  This must
 	 * not touch kernel_pmap, whose set is filled for all CPUs during
 	 * bootstrap.
 	 */
 	CPU_ZERO(&pmap->pm_active);
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 	bzero(&pmap->pm_pdir, sizeof(pte_t *) * PDIR_NENTRIES);
 	TAILQ_INIT(&pmap->pm_ptbl_list);
 }
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by mmu_booke_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * The only resource torn down here is the pmap lock.
  */
 static void
 mmu_booke_release(mmu_t mmu, pmap_t pmap)
 {
 
 	/* A pmap that still has resident pages must not be released. */
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 
 	PMAP_LOCK_DESTROY(pmap);
 }
 
 /*
  * Insert the given physical page at the specified virtual address in the
  * target physical map with the protection requested. If specified the page
  * will be wired down.
  *
  * This is the locking wrapper around mmu_booke_enter_locked(): it takes the
  * global pv list lock (write) and then the pmap lock for the duration of
  * the call.
  */
 static void
 mmu_booke_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, boolean_t wired)
 {
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	mmu_booke_enter_locked(mmu, pmap, va, m, prot, wired);
 	/* Note: the locks are dropped in acquisition order, not reversed. */
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Enter a mapping of page m at va in pmap; the pmap lock must be held
  * (asserted below).  The callers visible in this file also hold
  * pvh_global_lock in write mode.
  *
  * prot:  requested protection; read access is always granted, write and
  *        execute are added when requested.
  * wired: whether the mapping is to be wired.
  *
  * Two cases are handled:
  *  - a valid pte for va already maps the same physical page: only the
  *    protection and wiring are updated in place;
  *  - otherwise a fresh set of flags is built and the mapping is installed
  *    through pte_enter(), which removes any previous mapping at va.
  *
  * The instruction cache is synchronized for the page when execute
  * permission is newly granted, but only for the kernel pmap or the pmap
  * current on this CPU.
  */
 static void
 mmu_booke_enter_locked(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, boolean_t wired)
 {
 	pte_t *pte;
 	vm_paddr_t pa;
 	uint32_t flags;
 	int su, sync;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	su = (pmap == kernel_pmap);
 	sync = 0;
 
 	//debugf("mmu_booke_enter_locked: s (pmap=0x%08x su=%d tid=%d m=0x%08x va=0x%08x "
 	//		"pa=0x%08x prot=0x%08x wired=%d)\n",
 	//		(u_int32_t)pmap, su, pmap->pm_tid,
 	//		(u_int32_t)m, va, pa, prot, wired);
 
 	if (su) {
 		KASSERT(((va >= virtual_avail) &&
 		    (va <= VM_MAX_KERNEL_ADDRESS)),
 		    ("mmu_booke_enter_locked: kernel pmap, non kernel va"));
 	} else {
 		KASSERT((va <= VM_MAXUSER_ADDRESS),
 		    ("mmu_booke_enter_locked: user pmap, non user va"));
 	}
 	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
 	    VM_OBJECT_LOCKED(m->object),
 	    ("mmu_booke_enter_locked: page %p is not busy", m));
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * If there is an existing mapping, and the physical address has not
 	 * changed, must be protection or wiring change.
 	 */
 	if (((pte = pte_find(mmu, pmap, va)) != NULL) &&
 	    (PTE_ISVALID(pte)) && (PTE_PA(pte) == pa)) {
 	    
 		/*
 		 * Before actually updating pte->flags we calculate and
 		 * prepare its new value in a helper var.
 		 * Write, execute and modified bits are stripped here and
 		 * re-added below according to prot.
 		 */
 		flags = pte->flags;
 		flags &= ~(PTE_UW | PTE_UX | PTE_SW | PTE_SX | PTE_MODIFIED);
 
 		/* Wiring change, just update stats. */
 		if (wired) {
 			if (!PTE_ISWIRED(pte)) {
 				flags |= PTE_WIRED;
 				pmap->pm_stats.wired_count++;
 			}
 		} else {
 			if (PTE_ISWIRED(pte)) {
 				flags &= ~PTE_WIRED;
 				pmap->pm_stats.wired_count--;
 			}
 		}
 
 		if (prot & VM_PROT_WRITE) {
 			/* Add write permissions. */
 			flags |= PTE_SW;
 			if (!su)
 				flags |= PTE_UW;
 
 			if ((flags & PTE_MANAGED) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 		} else {
 			/* Handle modified pages, sense modify status. */
 
 			/*
 			 * The PTE_MODIFIED flag could be set by underlying
 			 * TLB misses since we last read it (above), possibly
 			 * other CPUs could update it so we check in the PTE
 			 * directly rather than rely on that saved local flags
 			 * copy.
 			 */
 			if (PTE_ISMODIFIED(pte))
 				vm_page_dirty(m);
 		}
 
 		if (prot & VM_PROT_EXECUTE) {
 			flags |= PTE_SX;
 			if (!su)
 				flags |= PTE_UX;
 
 			/*
 			 * Check existing flags for execute permissions: if we
 			 * are turning execute permissions on, icache should
 			 * be flushed.
 			 */
 			if ((pte->flags & (PTE_UX | PTE_SX)) == 0)
 				sync++;
 		}
 
 		/* The referenced bit is cleared as part of the update. */
 		flags &= ~PTE_REFERENCED;
 
 		/*
 		 * The new flags value is all calculated -- only now actually
 		 * update the PTE.
 		 */
 		mtx_lock_spin(&tlbivax_mutex);
 		tlb_miss_lock();
 
 		tlb0_flush_entry(va);
 		pte->flags = flags;
 
 		tlb_miss_unlock();
 		mtx_unlock_spin(&tlbivax_mutex);
 
 	} else {
 		/*
 		 * If there is an existing mapping, but it's for a different
 		 * physical address, pte_enter() will delete the old mapping.
 		 */
 		//if ((pte != NULL) && PTE_ISVALID(pte))
 		//	debugf("mmu_booke_enter_locked: replace\n");
 		//else
 		//	debugf("mmu_booke_enter_locked: new\n");
 
 		/* Now set up the flags and install the new mapping. */
 		flags = (PTE_SR | PTE_VALID);
 		flags |= PTE_M;
 
 		if (!su)
 			flags |= PTE_UR;
 
 		if (prot & VM_PROT_WRITE) {
 			flags |= PTE_SW;
 			if (!su)
 				flags |= PTE_UW;
 
 			if ((m->oflags & VPO_UNMANAGED) == 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 		}
 
 		if (prot & VM_PROT_EXECUTE) {
 			flags |= PTE_SX;
 			if (!su)
 				flags |= PTE_UX;
 		}
 
 		/* If its wired update stats. */
 		if (wired) {
 			pmap->pm_stats.wired_count++;
 			flags |= PTE_WIRED;
 		}
 
 		pte_enter(mmu, pmap, m, va, flags);
 
 		/* Flush the real memory from the instruction cache. */
 		if (prot & VM_PROT_EXECUTE)
 			sync++;
 	}
 
 	/*
 	 * The icache sync works on va, so it is only performed when va is
 	 * addressable here: kernel pmap or this CPU's current pmap.
 	 */
 	if (sync && (su || pmap == PCPU_GET(curpmap))) {
 		__syncicache((void *)va, PAGE_SIZE);
 		sync = 0;
 	}
 }
 
 /*
  * Map a run of resident pages that belong to one object.
  *
  * The run starts with m_start, which is mapped at start.  Every following
  * page on the listq chain is mapped at start plus its pindex distance from
  * m_start, for as long as that address stays below end.  Virtual pages
  * without a resident page at the corresponding offset are left unmapped.
  * Mappings are entered unwired and with at most read/execute permission.
  */
 static void
 mmu_booke_enter_object(mmu_t mmu, pmap_t pmap, vm_offset_t start,
     vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
 {
 	vm_pindex_t npages, off;
 	vm_page_t pg;
 
 	npages = atop(end - start);
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		off = pg->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		mmu_booke_enter_locked(mmu, pmap, start + ptoa(off), pg,
 		    prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE);
 	}
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Enter a single unwired mapping of page m at va with at most read/execute
  * permission, taking the global pv list lock and the pmap lock around the
  * operation.
  */
 static void
 mmu_booke_enter_quick(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot)
 {
 	vm_prot_t ro_prot;
 
 	/* Write permission is never granted through this path. */
 	ro_prot = prot & (VM_PROT_READ | VM_PROT_EXECUTE);
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	mmu_booke_enter_locked(mmu, pmap, va, m, ro_prot, FALSE);
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Remove every valid mapping in [va, endva) from the given pmap.
  *
  * Both addresses are assumed to be page aligned.  The walk runs with the
  * global pv list lock (write) and the pmap lock held; nothing is done when
  * PMAP_REMOVE_DONE() reports the pmap as already empty.
  */
 static void
 mmu_booke_remove(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_offset_t endva)
 {
 	uint8_t unhold;
 	pte_t *entry;
 
 	if (pmap == kernel_pmap) {
 		KASSERT(((va >= virtual_avail) &&
 		    (va <= VM_MAX_KERNEL_ADDRESS)),
 		    ("mmu_booke_remove: kernel pmap, non kernel va"));
 	} else {
 		KASSERT((va <= VM_MAXUSER_ADDRESS),
 		    ("mmu_booke_remove: user pmap, non user va"));
 	}
 
 	if (PMAP_REMOVE_DONE(pmap))
 		return;
 
 	/* Hold-count policy for pte_remove() depends on the pmap. */
 	unhold = PTBL_HOLD_FLAG(pmap);
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	while (va < endva) {
 		entry = pte_find(mmu, pmap, va);
 		if ((entry != NULL) && PTE_ISVALID(entry))
 			pte_remove(mmu, pmap, va, unhold);
 		va += PAGE_SIZE;
 	}
 	PMAP_UNLOCK(pmap);
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Remove physical page from all pmaps in which it resides.
  *
  * Walks the page's pv_list under the global pv list lock (write) and removes
  * each recorded mapping with the owning pmap locked.  Afterwards the page's
  * PGA_WRITEABLE flag is cleared.
  */
 static void
 mmu_booke_remove_all(mmu_t mmu, vm_page_t m)
 {
 	pv_entry_t pv, pvn;
 	vm_offset_t va;
 	uint8_t hold_flag;
 	pmap_t pmap;
 
 	rw_wlock(&pvh_global_lock);
 	for (pv = TAILQ_FIRST(&m->md.pv_list); pv != NULL; pv = pvn) {
 		pvn = TAILQ_NEXT(pv, pv_link);
 
 		/*
 		 * pte_remove() destroys the pv entry of a managed mapping
 		 * (pv_remove() -> pv_free()), so everything needed from pv
 		 * is read before the call and pv is not touched afterwards.
 		 */
 		pmap = pv->pv_pmap;
 		va = pv->pv_va;
 
 		PMAP_LOCK(pmap);
 		hold_flag = PTBL_HOLD_FLAG(pmap);
 		pte_remove(mmu, pmap, va, hold_flag);
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Map the physical range [pa_start, pa_end) into kernel virtual address
  * space, one page at a time, beginning at *virt.
  *
  * On return *virt has been advanced past the last page mapped; the return
  * value is the virtual address at which the range starts.  prot is not
  * consulted.
  */
 static vm_offset_t
 mmu_booke_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start,
     vm_paddr_t pa_end, int prot)
 {
 	vm_offset_t first_va, cur_va;
 
 	first_va = *virt;
 	for (cur_va = first_va; pa_start < pa_end;
 	    cur_va += PAGE_SIZE, pa_start += PAGE_SIZE)
 		mmu_booke_kenter(mmu, cur_va, pa_start);
 	*virt = cur_va;
 
 	return (first_va);
 }
 
 /*
  * Make the pmap of td's process the active one on the current CPU.  A pmap
  * has to be activated before its address space can be accessed in any way.
  *
  * Under sched_lock: the CPU is added to the pmap's active set, the pmap is
  * recorded as the CPU's current pmap, a TID is allocated if the pmap has
  * none on this CPU, and PID0 is loaded with that TID.
  */
 static void
 mmu_booke_activate(mmu_t mmu, struct thread *td)
 {
 	struct proc *p;
 	pmap_t pmap;
 	u_int cpu;
 
 	p = td->td_proc;
 	pmap = &p->p_vmspace->vm_pmap;
 
 	CTR5(KTR_PMAP, "%s: s (td = %p, proc = '%s', id = %d, pmap = 0x%08x)",
 	    __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap);
 
 	KASSERT((pmap != kernel_pmap), ("mmu_booke_activate: kernel_pmap!"));
 
 	mtx_lock_spin(&sched_lock);
 
 	cpu = PCPU_GET(cpuid);
 	CPU_SET_ATOMIC(cpu, &pmap->pm_active);
 	PCPU_SET(curpmap, pmap);
 
 	/* First activation on this CPU: the pmap needs a TID. */
 	if (pmap->pm_tid[cpu] == TID_NONE)
 		tid_alloc(pmap);
 
 	/* Load PID0 register with pmap tid value. */
 	mtspr(SPR_PID0, pmap->pm_tid[cpu]);
 	__asm __volatile("isync");
 
 	mtx_unlock_spin(&sched_lock);
 
 	CTR3(KTR_PMAP, "%s: e (tid = %d for '%s')", __func__,
 	    pmap->pm_tid[PCPU_GET(cpuid)], td->td_proc->p_comm);
 }
 
 /*
  * Deactivate the address space of the given thread's process on the current
  * CPU: the CPU is removed from the pmap's active set and the per-CPU current
  * pmap pointer is reset to NULL.
  */
 static void
 mmu_booke_deactivate(mmu_t mmu, struct thread *td)
 {
 	pmap_t pmap;
 	u_int cpu;
 
 	pmap = &td->td_proc->p_vmspace->vm_pmap;
 
 	CTR5(KTR_PMAP, "%s: td=%p, proc = '%s', id = %d, pmap = 0x%08x",
 	    __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap);
 
 	cpu = PCPU_GET(cpuid);
 	CPU_CLR_ATOMIC(cpu, &pmap->pm_active);
 	PCPU_SET(curpmap, NULL);
 }
 
 /*
  * Copy the range specified by src_addr/len
  * from the source map to the range dst_addr/len
  * in the destination map.
  *
  * This routine is only advisory and need not do anything.
  */
 static void
 mmu_booke_copy(mmu_t mmu, pmap_t dst_pmap, pmap_t src_pmap,
     vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr)
 {
 
 	/* Intentionally empty: this implementation copies no mappings. */
 }
 
 /*
  * Set the physical protection on the range [sva, eva) of this pmap as
  * requested.
  *
  * If read permission is not requested, the range is removed altogether.  If
  * write permission is requested, nothing is changed.  Otherwise every valid
  * mapping in the range loses its write permission and modified state.
  */
 static void
 mmu_booke_protect(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     vm_prot_t prot)
 {
 	vm_offset_t va;
 	vm_page_t pg;
 	pte_t *pte;
 
 	/* No read access requested: tear the mappings down. */
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		mmu_booke_remove(mmu, pmap, sva, eva);
 		return;
 	}
 
 	/* Write access is being kept, so there is nothing to revoke. */
 	if (prot & VM_PROT_WRITE)
 		return;
 
 	PMAP_LOCK(pmap);
 	for (va = sva; va < eva; va += PAGE_SIZE) {
 		pte = pte_find(mmu, pmap, va);
 		if (pte == NULL || !PTE_ISVALID(pte))
 			continue;
 
 		pg = PHYS_TO_VM_PAGE(PTE_PA(pte));
 
 		mtx_lock_spin(&tlbivax_mutex);
 		tlb_miss_lock();
 
 		/* Record the dirty state of managed pages before it is lost. */
 		if (PTE_ISMODIFIED(pte) && PTE_ISMANAGED(pte))
 			vm_page_dirty(pg);
 
 		/* Drop the TLB0 entry, then strip write/modified bits. */
 		tlb0_flush_entry(va);
 		pte->flags &= ~(PTE_UW | PTE_SW | PTE_MODIFIED);
 
 		tlb_miss_unlock();
 		mtx_unlock_spin(&tlbivax_mutex);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  *
  * The page must be managed and its object must be locked by the caller.
  * Every valid mapping on the page's pv list has its TLB0 entry invalidated
  * and its PTE_UW, PTE_SW and PTE_MODIFIED flags cleared; a mapping that was
  * modified marks the page dirty first.  PGA_WRITEABLE is cleared at the end.
  */
 static void
 mmu_booke_remove_write(mmu_t mmu, vm_page_t m)
 {
 	pv_entry_t pv;
 	pte_t *pte;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("mmu_booke_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
 	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
 	 * is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		PMAP_LOCK(pv->pv_pmap);
 		if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL) {
 			if (PTE_ISVALID(pte)) {
 				m = PHYS_TO_VM_PAGE(PTE_PA(pte));
 
 				mtx_lock_spin(&tlbivax_mutex);
 				tlb_miss_lock();
 
 				/* Handle modified pages. */
 				if (PTE_ISMODIFIED(pte))
 					vm_page_dirty(m);
 
 				/*
 				 * Flush mapping from TLB0 so that no cached
 				 * translation keeps the write permission that
 				 * is removed from the PTE below.  This matches
 				 * mmu_booke_protect() and
 				 * mmu_booke_clear_modify().
 				 */
 				tlb0_flush_entry(pv->pv_va);
 				pte->flags &= ~(PTE_UW | PTE_SW | PTE_MODIFIED);
 
 				tlb_miss_unlock();
 				mtx_unlock_spin(&tlbivax_mutex);
 			}
 		}
 		PMAP_UNLOCK(pv->pv_pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Synchronize the instruction cache for the range [va, va + sz) of pmap pm.
  *
  * The range is processed page by page.  For each page with a valid mapping
  * in pm:
  *  - if pm is the kernel pmap or the current CPU's pmap, the page is synced
  *    through its own virtual address;
  *  - otherwise the physical page is temporarily entered into the current
  *    pmap at virtual address 0, synced through that address, and removed.
  * Pages without a valid mapping are skipped.
  */
 static void
 mmu_booke_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz)
 {
 	pte_t *pte;
 	pmap_t pmap;
 	vm_page_t m;
 	vm_offset_t addr;
 	vm_paddr_t pa;
 	int active, valid;
 
 	/* An empty range touches no page. */
 	if (sz == 0)
 		return;
 
 	/*
 	 * Widen the range to whole pages.  The offset of va within its page
 	 * is added to the length before rounding, so that a range which
 	 * starts mid-page and crosses a page boundary still covers its last
 	 * page.
 	 */
 	sz = round_page(sz + (va & PAGE_MASK));
 	va = trunc_page(va);
 
 	rw_wlock(&pvh_global_lock);
 	pmap = PCPU_GET(curpmap);
 	active = (pm == kernel_pmap || pm == pmap) ? 1 : 0;
 	while (sz > 0) {
 		/* Look the page up in the target pmap under its lock. */
 		PMAP_LOCK(pm);
 		pte = pte_find(mmu, pm, va);
 		valid = (pte != NULL && PTE_ISVALID(pte)) ? 1 : 0;
 		if (valid)
 			pa = PTE_PA(pte);
 		PMAP_UNLOCK(pm);
 		if (valid) {
 			if (!active) {
 				/* Create a mapping in the active pmap. */
 				addr = 0;
 				m = PHYS_TO_VM_PAGE(pa);
 				PMAP_LOCK(pmap);
 				pte_enter(mmu, pmap, m, addr,
 				    PTE_SR | PTE_VALID | PTE_UR);
 				__syncicache((void *)addr, PAGE_SIZE);
 				pte_remove(mmu, pmap, addr, PTBL_UNHOLD);
 				PMAP_UNLOCK(pmap);
 			} else
 				__syncicache((void *)va, PAGE_SIZE);
 		}
 		va += PAGE_SIZE;
 		sz -= PAGE_SIZE;
 	}
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Atomically extract and hold the physical page with the given
  * pmap and virtual address pair if that mapping permits the given
  * protection.
  *
  * Returns the held page, or NULL if there is no valid mapping at va or if
  * write access was requested and the mapping does not carry the relevant
  * write bit (PTE_SW for the kernel pmap, PTE_UW otherwise).
  */
 static vm_page_t
 mmu_booke_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va,
     vm_prot_t prot)
 {
 	pte_t *pte;
 	vm_page_t m;
 	uint32_t pte_wbit;
 	vm_paddr_t pa;
 	
 	m = NULL;
 	pa = 0;	
 	PMAP_LOCK(pmap);
 retry:
 	pte = pte_find(mmu, pmap, va);
 	if ((pte != NULL) && PTE_ISVALID(pte)) {
 		/* Pick the write-permission bit that applies to this pmap. */
 		if (pmap == kernel_pmap)
 			pte_wbit = PTE_SW;
 		else
 			pte_wbit = PTE_UW;
 
 		if ((pte->flags & pte_wbit) || ((prot & VM_PROT_WRITE) == 0)) {
 			/*
 			 * A non-zero return means the lookup has to be
 			 * redone, so start over from pte_find().
 			 */
 			if (vm_page_pa_tryrelock(pmap, PTE_PA(pte), &pa))
 				goto retry;
 			m = PHYS_TO_VM_PAGE(PTE_PA(pte));
 			vm_page_hold(m);
 		}
 	}
 
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Initialize a vm_page's machine-dependent fields.
  *
  * The only field set up here is the page's pv list, which starts out empty.
  */
 static void
 mmu_booke_page_init(mmu_t mmu, vm_page_t m)
 {
 
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  * mmu_booke_zero_page_area zeros part of the specified hardware page by
  * mapping it into virtual memory and using bzero to clear
  * its contents.
  *
  * off is the byte offset within the page at which to start and size is the
  * number of bytes to clear.  off and size must reside within a single page;
  * this is asserted when the kernel is built with assertions enabled.
  *
  * The page is mapped at zero_page_va for the duration of the operation, and
  * zero_page_mutex serializes use of that address.
  */
 static void
 mmu_booke_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size)
 {
 	vm_offset_t va;
 
 	/*
 	 * The check is written as "size <= PAGE_SIZE - off" so that it cannot
 	 * overflow for large arguments.
 	 */
 	KASSERT(off >= 0 && size >= 0 && off <= PAGE_SIZE &&
 	    size <= PAGE_SIZE - off,
 	    ("mmu_booke_zero_page_area: off %d size %d not within a page",
 	    off, size));
 
 	mtx_lock(&zero_page_mutex);
 	va = zero_page_va;
 
 	mmu_booke_kenter(mmu, va, VM_PAGE_TO_PHYS(m));
 	bzero((caddr_t)va + off, size);
 	mmu_booke_kremove(mmu, va);
 
 	mtx_unlock(&zero_page_mutex);
 }
 
 /*
  * mmu_booke_zero_page zeros the specified hardware page.
  *
  * Implemented as mmu_booke_zero_page_area() over the whole page
  * (offset 0, length PAGE_SIZE).
  */
 static void
 mmu_booke_zero_page(mmu_t mmu, vm_page_t m)
 {
 
 	mmu_booke_zero_page_area(mmu, m, 0, PAGE_SIZE);
 }
 
 /*
  * mmu_booke_copy_page copies the specified (machine independent) page by
  * mapping the page into virtual memory and using memcpy to copy the page,
  * one machine dependent page at a time.
  *
  * sm is the source page and dm the destination page.  Both are mapped at the
  * fixed addresses copy_page_src_va and copy_page_dst_va while
  * copy_page_mutex is held.
  */
 static void
 mmu_booke_copy_page(mmu_t mmu, vm_page_t sm, vm_page_t dm)
 {
 	vm_offset_t sva, dva;
 
 	sva = copy_page_src_va;
 	dva = copy_page_dst_va;
 
 	mtx_lock(&copy_page_mutex);
 	mmu_booke_kenter(mmu, sva, VM_PAGE_TO_PHYS(sm));
 	mmu_booke_kenter(mmu, dva, VM_PAGE_TO_PHYS(dm));
 	memcpy((caddr_t)dva, (caddr_t)sva, PAGE_SIZE);
 	/* Tear the temporary mappings down again before unlocking. */
 	mmu_booke_kremove(mmu, dva);
 	mmu_booke_kremove(mmu, sva);
 	mtx_unlock(&copy_page_mutex);
 }
 
 /*
  * mmu_booke_zero_page_idle zeros the specified hardware page by mapping it
  * into virtual memory and using bzero to clear its contents. This is intended
  * to be called from the vm_pagezero process only and outside of Giant. No
  * lock is required.
  *
  * The page is mapped at the dedicated address zero_page_idle_va, which is
  * why, unlike mmu_booke_zero_page_area(), no mutex is taken here.
  */
 static void
 mmu_booke_zero_page_idle(mmu_t mmu, vm_page_t m)
 {
 	vm_offset_t va;
 
 	va = zero_page_idle_va;
 	mmu_booke_kenter(mmu, va, VM_PAGE_TO_PHYS(m));
 	bzero((caddr_t)va, PAGE_SIZE);
 	mmu_booke_kremove(mmu, va);
 }
 
 /*
  * Report whether any mapping of the given managed physical page has its
  * modified bit set.
  *
  * The page's object must be locked by the caller.  The walk over the pv list
  * stops at the first modified mapping found.
  */
 static boolean_t
 mmu_booke_is_modified(mmu_t mmu, vm_page_t m)
 {
 	pv_entry_t pv;
 	pte_t *ptep;
 	boolean_t dirty;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("mmu_booke_is_modified: page %p is not managed", m));
 	dirty = FALSE;
 
 	/*
 	 * A page that is not VPO_BUSY cannot have PGA_WRITEABLE set behind
 	 * our back while its object is locked.  With PGA_WRITEABLE clear no
 	 * PTE can be modified, so the pv list need not be walked.
 	 */
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return (dirty);
 
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		PMAP_LOCK(pv->pv_pmap);
 		ptep = pte_find(mmu, pv->pv_pmap, pv->pv_va);
 		if (ptep != NULL && PTE_ISVALID(ptep) && PTE_ISMODIFIED(ptep))
 			dirty = TRUE;
 		PMAP_UNLOCK(pv->pv_pmap);
 
 		/* One modified mapping is enough. */
 		if (dirty)
 			break;
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (dirty);
 }
 
 /*
  * Return whether or not the specified virtual address is eligible
  * for prefault.
  *
  * This implementation never reports an address as prefaultable: it always
  * returns FALSE regardless of pmap and addr.
  */
 static boolean_t
 mmu_booke_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t addr)
 {
 
 	return (FALSE);
 }
 
 /*
  * Report whether any mapping of the given managed physical page has its
  * referenced bit set.  The walk over the pv list stops at the first such
  * mapping.
  */
 static boolean_t
 mmu_booke_is_referenced(mmu_t mmu, vm_page_t m)
 {
 	pv_entry_t pv;
 	pte_t *ptep;
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("mmu_booke_is_referenced: page %p is not managed", m));
 
 	referenced = FALSE;
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		PMAP_LOCK(pv->pv_pmap);
 		ptep = pte_find(mmu, pv->pv_pmap, pv->pv_va);
 		if (ptep != NULL && PTE_ISVALID(ptep) &&
 		    PTE_ISREFERENCED(ptep))
 			referenced = TRUE;
 		PMAP_UNLOCK(pv->pv_pmap);
 
 		/* One referenced mapping is enough. */
 		if (referenced)
 			break;
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (referenced);
 }
 
 /*
  * Clear the modify bits on the specified physical page.
  *
  * The page must be managed and not busy, and its object must be locked.
  * For every valid mapping that has any of PTE_SW, PTE_UW or PTE_MODIFIED
  * set, the TLB0 entry is flushed and those three flags are cleared together
  * with PTE_REFERENCED.
  */
 static void
 mmu_booke_clear_modify(mmu_t mmu, vm_page_t m)
 {
 	pte_t *pte;
 	pv_entry_t pv;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("mmu_booke_clear_modify: page %p is not managed", m));
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	KASSERT((m->oflags & VPO_BUSY) == 0,
 	    ("mmu_booke_clear_modify: page %p is busy", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can be modified.
 	 * If the object containing the page is locked and the page is not
 	 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		PMAP_LOCK(pv->pv_pmap);
 		if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL &&
 		    PTE_ISVALID(pte)) {
 			mtx_lock_spin(&tlbivax_mutex);
 			tlb_miss_lock();
 			
 			/* Only touch the TLB when there is something to clear. */
 			if (pte->flags & (PTE_SW | PTE_UW | PTE_MODIFIED)) {
 				tlb0_flush_entry(pv->pv_va);
 				pte->flags &= ~(PTE_SW | PTE_UW | PTE_MODIFIED |
 				    PTE_REFERENCED);
 			}
 
 			tlb_miss_unlock();
 			mtx_unlock_spin(&tlbivax_mutex);
 		}
 		PMAP_UNLOCK(pv->pv_pmap);
 	}
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Count the reference bits set on a managed page's mappings, clearing each
  * bit that is counted.
  *
  * Not every reference bit has to be cleared, but 0 may only be returned when
  * no reference bit is set at all.  The walk stops once more than four
  * referenced mappings have been counted.
  *
  * XXX: The exact number of bits to check and clear is a matter that
  * should be tested and standardized at some point in the future for
  * optimal aging of shared pages.
  */
 static int
 mmu_booke_ts_referenced(mmu_t mmu, vm_page_t m)
 {
 	pv_entry_t pv;
 	pte_t *ptep;
 	int nrefs;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("mmu_booke_ts_referenced: page %p is not managed", m));
 
 	nrefs = 0;
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		PMAP_LOCK(pv->pv_pmap);
 		ptep = pte_find(mmu, pv->pv_pmap, pv->pv_va);
 		if (ptep != NULL && PTE_ISVALID(ptep) &&
 		    PTE_ISREFERENCED(ptep)) {
 			mtx_lock_spin(&tlbivax_mutex);
 			tlb_miss_lock();
 
 			/* Flush the TLB0 entry, then clear the bit. */
 			tlb0_flush_entry(pv->pv_va);
 			ptep->flags &= ~PTE_REFERENCED;
 
 			tlb_miss_unlock();
 			mtx_unlock_spin(&tlbivax_mutex);
 
 			nrefs++;
 		}
 		PMAP_UNLOCK(pv->pv_pmap);
 
 		/* Enough mappings sampled; stop early. */
 		if (nrefs > 4)
 			break;
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (nrefs);
 }
 
 /*
  * Clear the reference bit in every mapping of the specified managed
  * physical page.  Mappings whose bit is set have their TLB0 entry flushed
  * before the bit is cleared.
  */
 static void
 mmu_booke_clear_reference(mmu_t mmu, vm_page_t m)
 {
 	pv_entry_t pv;
 	pte_t *ptep;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("mmu_booke_clear_reference: page %p is not managed", m));
 
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		PMAP_LOCK(pv->pv_pmap);
 		ptep = pte_find(mmu, pv->pv_pmap, pv->pv_va);
 		if (ptep != NULL && PTE_ISVALID(ptep) &&
 		    PTE_ISREFERENCED(ptep)) {
 			mtx_lock_spin(&tlbivax_mutex);
 			tlb_miss_lock();
 
 			tlb0_flush_entry(pv->pv_va);
 			ptep->flags &= ~PTE_REFERENCED;
 
 			tlb_miss_unlock();
 			mtx_unlock_spin(&tlbivax_mutex);
 		}
 		PMAP_UNLOCK(pv->pv_pmap);
 	}
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Set or clear the wired attribute of the mapping at va in the given pmap.
  *
  * The pmap's wired_count statistic is adjusted only when the attribute
  * actually changes.  Nothing happens if no PTE is found for va.
  */
 static void
 mmu_booke_change_wiring(mmu_t mmu, pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	pte_t *ptep;
 
 	PMAP_LOCK(pmap);
 	ptep = pte_find(mmu, pmap, va);
 	if (ptep != NULL) {
 		if (wired && !PTE_ISWIRED(ptep)) {
 			/* Unwired -> wired. */
 			ptep->flags |= PTE_WIRED;
 			pmap->pm_stats.wired_count++;
 		} else if (!wired && PTE_ISWIRED(ptep)) {
 			/* Wired -> unwired. */
 			ptep->flags &= ~PTE_WIRED;
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Return true if one of the first 16 pvs linked to the page belongs to the
  * given pmap.  The limit may be changed upwards or downwards in the future;
  * it is only necessary that true be returned for a small subset of pmaps
  * for proper page aging.
  */
 static boolean_t
 mmu_booke_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	boolean_t found;
 	int budget;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("mmu_booke_page_exists_quick: page %p is not managed", m));
 
 	found = FALSE;
 	budget = 16;
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		if (pv->pv_pmap == pmap) {
 			found = TRUE;
 			break;
 		}
 		/* Give up once the inspection budget is used up. */
 		if (--budget == 0)
 			break;
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (found);
 }
 
 /*
  * Count the wired mappings of the given physical page.  Unmanaged pages
  * always yield 0; for managed pages every valid, wired mapping on the pv
  * list is counted.
  */
 static int
 mmu_booke_page_wired_mappings(mmu_t mmu, vm_page_t m)
 {
 	pv_entry_t pv;
 	pte_t *ptep;
 	int nwired;
 
 	nwired = 0;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (nwired);
 
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		PMAP_LOCK(pv->pv_pmap);
 		ptep = pte_find(mmu, pv->pv_pmap, pv->pv_va);
 		if (ptep != NULL && PTE_ISVALID(ptep) && PTE_ISWIRED(ptep))
 			nwired++;
 		PMAP_UNLOCK(pv->pv_pmap);
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (nwired);
 }
 
 /*
  * Check whether the physical range [pa, pa + size) is already covered by an
  * I/O mapping in one of the TLB1 entries in use.
  *
  * Returns 0 if a single entry covers the range, EFAULT otherwise.
  */
 static int
 mmu_booke_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
 {
 	vm_offset_t mapped_va;
 	int idx;
 
 	/*
 	 * Each entry is checked on its own, so a range that overlaps
 	 * several TLB1 entries is currently not recognized.
 	 */
 	for (idx = 0; idx < tlb1_idx; idx++)
 		if (tlb1_iomapped(idx, pa, size, &mapped_va) == 0)
 			return (0);
 
 	return (EFAULT);
 }
 
 /*
  * Make a piece of the dump chunk md, starting at byte offset ofs, accessible
  * and return the virtual address to read it from.
  *
  * *sz is the number of bytes the caller would like to access; it is reduced
  * when fewer bytes are addressable from the returned address.
  */
 vm_offset_t
 mmu_booke_dumpsys_map(mmu_t mmu, struct pmap_md *md, vm_size_t ofs,
     vm_size_t *sz)
 {
 	vm_paddr_t pa, ppa;
 	vm_offset_t va;
 	vm_size_t gran;
 
 	/* Raw physical memory dumps don't have a virtual address. */
 	if (md->md_vaddr == ~0UL) {
 		/* We always map a 256MB page at 256M. */
 		gran = 256 * 1024 * 1024;
 		pa = md->md_paddr + ofs;
 		/* ppa: pa rounded down to a 256MB boundary. */
 		ppa = pa & ~(gran - 1);
 		ofs = pa - ppa;
 		va = gran;
 		/* Uses a fresh TLB1 entry; see mmu_booke_dumpsys_unmap(). */
 		tlb1_set_entry(va, ppa, gran, _TLB_ENTRY_IO);
 		/* Do not let the caller run past the end of the window. */
 		if (*sz > (gran - ofs))
 			*sz = gran - ofs;
 		return (va + ofs);
 	}
 
 	/* Minidumps are based on virtual memory addresses. */
 	va = md->md_vaddr + ofs;
 	if (va >= kernstart + kernsize) {
 		/* Beyond kernstart + kernsize: hand out one page at most. */
 		gran = PAGE_SIZE - (va & PAGE_MASK);
 		if (*sz > gran)
 			*sz = gran;
 	}
 	return (va);
 }
 
 /*
  * Undo a mapping established by mmu_booke_dumpsys_map().
  *
  * For raw physical dumps the most recently added TLB1 entry is cleared and
  * written back to the hardware; for minidumps there is nothing to undo.
  * The ofs and va arguments are not used.
  */
 void
 mmu_booke_dumpsys_unmap(mmu_t mmu, struct pmap_md *md, vm_size_t ofs,
     vm_offset_t va)
 {
 
 	/* Raw physical memory dumps don't have a virtual address. */
 	if (md->md_vaddr == ~0UL) {
 		/* Release the last TLB1 slot and invalidate it. */
 		tlb1_idx--;
 		tlb1[tlb1_idx].mas1 = 0;
 		tlb1[tlb1_idx].mas2 = 0;
 		tlb1[tlb1_idx].mas3 = 0;
 		tlb1_write_entry(tlb1_idx);
 		return;
 	}
  
 	/* Minidumps are based on virtual memory addresses. */
 	/* Nothing to do... */
 }
 
 /*
  * Return the next chunk of memory to dump, or NULL when there are no more.
  *
  * prev is NULL on the first call and the previously returned descriptor on
  * later calls.  The result points to a static descriptor that is
  * overwritten by every call.
  *
  * For minidumps the chunks are described by virtual address: first the
  * range from _etext to _end, then data_start..data_end, then each run of
  * valid kernel mappings up to virtual_end (the buffer cache range is
  * skipped).  Otherwise the chunks are the physical memory regions reported
  * by mem_regions().
  */
 struct pmap_md *
 mmu_booke_scan_md(mmu_t mmu, struct pmap_md *prev)
 {
 	static struct pmap_md md;
 	pte_t *pte;
 	vm_offset_t va;
  
 	if (dumpsys_minidump) {
 		md.md_paddr = ~0UL;	/* Minidumps use virtual addresses. */
 		if (prev == NULL) {
 			/* 1st: kernel .data and .bss. */
 			md.md_index = 1;
 			md.md_vaddr = trunc_page((uintptr_t)_etext);
 			md.md_size = round_page((uintptr_t)_end) - md.md_vaddr;
 			return (&md);
 		}
 		switch (prev->md_index) {
 		case 1:
 			/* 2nd: msgbuf and tables (see pmap_bootstrap()). */
 			md.md_index = 2;
 			md.md_vaddr = data_start;
 			md.md_size = data_end - data_start;
 			break;
 		case 2:
 			/*
 			 * 3rd: kernel VM.  md_index stays at 2 while chunks
 			 * are found, so this case is entered again on the
 			 * next call and continues after the previous chunk.
 			 */
 			va = prev->md_vaddr + prev->md_size;
 			/* Find start of next chunk (from va). */
 			while (va < virtual_end) {
 				/* Don't dump the buffer cache. */
 				if (va >= kmi.buffer_sva &&
 				    va < kmi.buffer_eva) {
 					va = kmi.buffer_eva;
 					continue;
 				}
 				pte = pte_find(mmu, kernel_pmap, va);
 				if (pte != NULL && PTE_ISVALID(pte))
 					break;
 				va += PAGE_SIZE;
 			}
 			if (va < virtual_end) {
 				md.md_vaddr = va;
 				va += PAGE_SIZE;
 				/* Find last page in chunk. */
 				while (va < virtual_end) {
 					/* Don't run into the buffer cache. */
 					if (va == kmi.buffer_sva)
 						break;
 					pte = pte_find(mmu, kernel_pmap, va);
 					if (pte == NULL || !PTE_ISVALID(pte))
 						break;
 					va += PAGE_SIZE;
 				}
 				md.md_size = va - md.md_vaddr;
 				break;
 			}
 			/* No further valid mapping: the scan is complete. */
 			md.md_index = 3;
 			/* FALLTHROUGH */
 		default:
 			return (NULL);
 		}
 	} else { /* not a minidump: dump physical memory regions */
 		mem_regions(&physmem_regions, &physmem_regions_sz,
 		    &availmem_regions, &availmem_regions_sz);
 
 		if (prev == NULL) {
 			/* first physical chunk. */
 			md.md_paddr = physmem_regions[0].mr_start;
 			md.md_size = physmem_regions[0].mr_size;
 			md.md_vaddr = ~0UL;
 			md.md_index = 1;
 		} else if (md.md_index < physmem_regions_sz) {
 			/* md_index is the index of the next region to report. */
 			md.md_paddr = physmem_regions[md.md_index].mr_start;
 			md.md_size = physmem_regions[md.md_index].mr_size;
 			md.md_vaddr = ~0UL;
 			md.md_index++;
 		} else {
 			/* There's no next physical chunk. */
 			return (NULL);
 		}
 	}
 
 	return (&md);
 }
 
 /*
  * Map a set of physical memory pages into the kernel virtual address space.
  * Return a pointer to where it is mapped. This routine is intended to be used
  * for mapping device memory, NOT real memory.
  *
  * Ranges that lie entirely within the CCSR window are served from its
  * existing mapping.  Any other range is mapped with new TLB1 entries.
  */
 static void *
 mmu_booke_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
 {
 	void *res;
 	uintptr_t va;
 	vm_size_t sz;
 
 	/*
 	 * CCSR is premapped. Note that (pa + size - 1) is there to make sure
 	 * we don't wrap around. Devices on the local bus typically extend all
 	 * the way up to and including 0xffffffff. In that case (pa + size)
 	 * would be 0. This would create a false positive (i.e. we would think
 	 * the range is within the CCSR) and no mapping would be created.
 	 */
 	if (pa >= ccsrbar_pa && (pa + size - 1) < (ccsrbar_pa + CCSRBAR_SIZE)) {
 		va = CCSRBAR_VA + (pa - ccsrbar_pa);
 		return ((void *)va);
 	}
 
 	/*
 	 * Physical addresses at or above 0x80000000 are mapped at the same
 	 * virtual address; lower ones are placed at 0xe2000000 + pa.
 	 */
 	va = (pa >= 0x80000000) ? pa : (0xe2000000 + pa);
 	res = (void *)va;
 
 	do {
 		/*
 		 * Largest power of 4 not exceeding the remaining size: the
 		 * log2 is rounded down to an even number.
 		 */
 		sz = 1 << (ilog2(size) & ~1);
 		if (bootverbose)
 			printf("Wiring VA=%x to PA=%x (size=%x), "
 			    "using TLB1[%d]\n", va, pa, sz, tlb1_idx);
 		tlb1_set_entry(va, pa, sz, _TLB_ENTRY_IO);
 		size -= sz;
 		pa += sz;
 		va += sz;
 	} while (size > 0);
 
 	return (res);
 }
 
 /*
  * 'Unmap' a range mapped by mmu_booke_mapdev().
  *
  * Only addresses inside kernel virtual space are acted upon: the pages
  * spanned by [va, va + size) are handed back with kmem_free().  Any other
  * address is ignored.
  */
 static void
 mmu_booke_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size)
 {
 	vm_offset_t page_va, page_off;
 
 	/* Outside kernel virtual space there is nothing to release. */
 	if ((va < VM_MIN_KERNEL_ADDRESS) || (va > VM_MAX_KERNEL_ADDRESS))
 		return;
 
 	page_va = trunc_page(va);
 	page_off = va & PAGE_MASK;
 	size = roundup(page_off + size, PAGE_SIZE);
 	kmem_free(kernel_map, page_va, size);
 }
 
 /*
  * mmu_booke_object_init_pt preloads the ptes for a given object into the
  * specified pmap. This eliminates the blast of soft faults on process startup
  * and immediately after an mmap.
  *
  * NOTE: this implementation does not preload anything.  It only asserts
  * that the object is locked and that it is a device (OBJT_DEVICE) or
  * scatter/gather (OBJT_SG) object.
  */
 static void
 mmu_booke_object_init_pt(mmu_t mmu, pmap_t pmap, vm_offset_t addr,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("mmu_booke_object_init_pt: non-device object"));
 }
 
 /*
  * Perform the pmap work for mincore.
  *
  * Not implemented yet: the body consists of the TODO marker and a return
  * value of 0.  None of the arguments are examined.
  */
 static int
 mmu_booke_mincore(mmu_t mmu, pmap_t pmap, vm_offset_t addr,
     vm_paddr_t *locked_pa)
 {
 
 	TODO;
 	return (0);
 }
 
 /**************************************************************************/
 /* TID handling */
 /**************************************************************************/
 
 /*
  * Allocate a TID. If necessary, steal one from someone else.
  *
  * TIDs are handed out per CPU in round-robin order from the per-CPU
  * tid_next counter.  When the chosen TID is still assigned to another pmap
  * on this CPU, that pmap loses it (its pm_tid entry becomes TID_NONE) and
  * all TLB entries tagged with the TID are flushed before it is reused.
  *
  * Returns the TID now assigned to pmap on the current CPU.  Must not be
  * called for the kernel pmap.
  */
 static tlbtid_t
 tid_alloc(pmap_t pmap)
 {
 	tlbtid_t tid;
 	int thiscpu;
 
 	KASSERT((pmap != kernel_pmap), ("tid_alloc: kernel pmap"));
 
 	CTR2(KTR_PMAP, "%s: s (pmap = %p)", __func__, pmap);
 
 	thiscpu = PCPU_GET(cpuid);
 
 	/* Take the next TID, wrapping around to TID_MIN past TID_MAX. */
 	tid = PCPU_GET(tid_next);
 	if (tid > TID_MAX)
 		tid = TID_MIN;
 	PCPU_SET(tid_next, tid + 1);
 
 	/* If we are stealing TID then clear the relevant pmap's field */
 	if (tidbusy[thiscpu][tid] != NULL) {
 
 		CTR2(KTR_PMAP, "%s: warning: stealing tid %d", __func__, tid);
 		
 		tidbusy[thiscpu][tid]->pm_tid[thiscpu] = TID_NONE;
 
 		/* Flush all entries from TLB0 matching this TID. */
 		tid_flush(tid);
 	}
 
 	/* Record the new owner in both directions. */
 	tidbusy[thiscpu][tid] = pmap;
 	pmap->pm_tid[thiscpu] = tid;
 	__asm __volatile("msync; isync");
 
 	CTR3(KTR_PMAP, "%s: e (%02d next = %02d)", __func__, tid,
 	    PCPU_GET(tid_next));
 
 	return (tid);
 }
 
 /**************************************************************************/
 /* TLB0 handling */
 /**************************************************************************/
 
 /*
  * Print one TLB entry, given its index and the raw MAS1, MAS2, MAS3 and MAS7
  * register values.  The two-character tag shows 'V' for a valid entry and
  * 'P' for an entry with MAS1_IPROT set.
  */
 static void
 tlb_print_entry(int i, uint32_t mas1, uint32_t mas2, uint32_t mas3,
     uint32_t mas7)
 {
 	char desc[3];
 	unsigned int tsize;
 	vm_size_t size;
 	tlbtid_t tid;
 	int as;
 
 	desc[0] = (mas1 & MAS1_VALID) ? 'V' : ' ';
 	desc[1] = (mas1 & MAS1_IPROT) ? 'P' : ' ';
 	desc[2] = '\0';
 
 	as = (mas1 & MAS1_TS_MASK) ? 1 : 0;
 	tid = MAS1_GETTID(mas1);
 
 	/* A TSIZE of 0 is reported as a size of 0. */
 	tsize = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
 	size = (tsize != 0) ? tsize2size(tsize) : 0;
 
 	debugf("%3d: (%s) [AS=%d] "
 	    "sz = 0x%08x tsz = %d tid = %d mas1 = 0x%08x "
 	    "mas2(va) = 0x%08x mas3(pa) = 0x%08x mas7 = 0x%08x\n",
 	    i, desc, as, size, tsize, tid, mas1, mas2, mas3, mas7);
 }
 
 /*
  * Convert a TLB0 virtual address and way number to a tlb0[] table index:
  * the entries of one way are contiguous, and the position within the way is
  * taken from the entry-index bits of the address.
  */
 static inline unsigned int
 tlb0_tableidx(vm_offset_t va, unsigned int way)
 {
 	unsigned int way_base, slot;
 
 	way_base = (way * TLB0_ENTRIES_PER_WAY);
 	slot = (va & MAS2_TLB0_ENTRY_IDX_MASK) >> MAS2_TLB0_ENTRY_IDX_SHIFT;
 	return (way_base + slot);
 }
 
 /*
  * Invalidate TLB0 entry.
  *
  * Issues a tlbivax for the effective page number of va, followed by the
  * synchronization instructions below.  The caller must hold tlbivax_mutex.
  */
 static inline void
 tlb0_flush_entry(vm_offset_t va)
 {
 
 	CTR2(KTR_PMAP, "%s: s va=0x%08x", __func__, va);
 
 	mtx_assert(&tlbivax_mutex, MA_OWNED);
 
 	/* Only the page-number bits of va are passed to tlbivax. */
 	__asm __volatile("tlbivax 0, %0" :: "r"(va & MAS2_EPN_MASK));
 	__asm __volatile("isync; msync");
 	__asm __volatile("tlbsync; msync");
 
 	CTR1(KTR_PMAP, "%s: e", __func__);
 }
 
 /*
  * Print out contents of the MAS registers for each TLB0 entry.
  *
  * Every (way, entry index) pair is selected through MAS0/MAS2, read back
  * with tlbre and printed with tlb_print_entry().
  */
 void
 tlb0_print_tlbentries(void)
 {
 	uint32_t mas0, mas1, mas2, mas3, mas7;
 	int entryidx, way, idx;
 
 	debugf("TLB0 entries:\n");
 	for (way = 0; way < TLB0_WAYS; way ++)
 		for (entryidx = 0; entryidx < TLB0_ENTRIES_PER_WAY; entryidx++) {
 
 			/* Select TLB0 and the way to read from. */
 			mas0 = MAS0_TLBSEL(0) | MAS0_ESEL(way);
 			mtspr(SPR_MAS0, mas0);
 			__asm __volatile("isync");
 
 			/* The entry within the way is selected via MAS2. */
 			mas2 = entryidx << MAS2_TLB0_ENTRY_IDX_SHIFT;
 			mtspr(SPR_MAS2, mas2);
 
 			__asm __volatile("isync; tlbre");
 
 			mas1 = mfspr(SPR_MAS1);
 			mas2 = mfspr(SPR_MAS2);
 			mas3 = mfspr(SPR_MAS3);
 			mas7 = mfspr(SPR_MAS7);
 
 			idx = tlb0_tableidx(mas2, way);
 			tlb_print_entry(idx, mas1, mas2, mas3, mas7);
 		}
 }
 
 /**************************************************************************/
 /* TLB1 handling */
 /**************************************************************************/
 
 /*
  * TLB1 mapping notes:
  *
  * TLB1[0..n-1]	Kernel text and data, as found at boot (see tlb1_init()).
  * TLB1[n]	CCSRBAR
  * TLB1[n+1-15]	Additional kernel text and data mappings (if required), PCI
  *		windows, other devices mappings.
  */
 
 /*
  * Write given entry to TLB1 hardware.
  * Use 32 bit pa, clear 4 high-order bits of RPN (mas7).
  *
  * idx selects both the hardware TLB1 slot and the tlb1[] table element whose
  * mas1, mas2 and mas3 values are written.
  */
 static void
 tlb1_write_entry(unsigned int idx)
 {
 	uint32_t mas0, mas7;
 
 	//debugf("tlb1_write_entry: s\n");
 
 	/* Clear high order RPN bits */
 	mas7 = 0;
 
 	/* Select entry */
 	mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(idx);
 	//debugf("tlb1_write_entry: mas0 = 0x%08x\n", mas0);
 
 	/* Load the MAS registers one by one, then commit with tlbwe. */
 	mtspr(SPR_MAS0, mas0);
 	__asm __volatile("isync");
 	mtspr(SPR_MAS1, tlb1[idx].mas1);
 	__asm __volatile("isync");
 	mtspr(SPR_MAS2, tlb1[idx].mas2);
 	__asm __volatile("isync");
 	mtspr(SPR_MAS3, tlb1[idx].mas3);
 	__asm __volatile("isync");
 	mtspr(SPR_MAS7, mas7);
 	__asm __volatile("isync; tlbwe; isync; msync");
 
 	//debugf("tlb1_write_entry: e\n");
 }
 
 /*
  * Return the largest uint value log such that 2^log <= num.
  *
  * Computed as 31 minus the result of the cntlzw instruction on num.
  * NOTE(review): num == 0 has no valid logarithm; callers are presumably
  * expected to pass a non-zero value -- confirm.
  */
 static unsigned int
 ilog2(unsigned int num)
 {
 	int lz;
 
 	__asm ("cntlzw %0, %1" : "=r" (lz) : "r" (num));
 	return (31 - lz);
 }
 
 /*
  * Convert TLB TSIZE value to mapped region size.
  */
 static vm_size_t
 tsize2size(unsigned int tsize)
 {
 
 	/*
 	 * size = 4^tsize KB
 	 * size = 4^tsize * 2^10 = 2^(2 * tsize - 10)
 	 */
 
 	return ((1 << (2 * tsize)) * 1024);
 }
 
 /*
  * Convert region size (must be power of 4) to TLB TSIZE value.
  *
  * This is the inverse of tsize2size(): with size = 2^(2 * tsize + 10),
  * tsize = log2(size) / 2 - 5.
  */
 static unsigned int
 size2tsize(vm_size_t size)
 {
 
 	return (ilog2(size) / 2 - 5);
 }
 
 /*
  * Register permanent kernel mapping in TLB1.
  *
  * Entries are created starting from index 0 (current free entry is
  * kept in tlb1_idx) and are not supposed to be invalidated.
  *
  * va and pa are the virtual and physical start of the mapping, size its
  * length (converted to a TSIZE value with size2tsize()) and flags the
  * attribute bits that are OR-ed into MAS2.  The entry is stored in tlb1[],
  * written to the hardware, and tlb1_idx is advanced.
  *
  * Returns 0 on success, or -1 if all TLB1_ENTRIES slots are in use.
  */
 static int
 tlb1_set_entry(vm_offset_t va, vm_offset_t pa, vm_size_t size,
     uint32_t flags)
 {
 	uint32_t ts, tid;
 	int tsize;
 	
 	if (tlb1_idx >= TLB1_ENTRIES) {
 		printf("tlb1_set_entry: TLB1 full!\n");
 		return (-1);
 	}
 
 	/* Convert size to TSIZE */
 	tsize = size2tsize(size);
 
 	tid = (TID_KERNEL << MAS1_TID_SHIFT) & MAS1_TID_MASK;
 	/* XXX TS is hard coded to 0 for now as we only use single address space */
 	ts = (0 << MAS1_TS_SHIFT) & MAS1_TS_MASK;
 
 	/* XXX LOCK tlb1[] */
 
 	/* The entry is marked valid and protected (MAS1_IPROT). */
 	tlb1[tlb1_idx].mas1 = MAS1_VALID | MAS1_IPROT | ts | tid;
 	tlb1[tlb1_idx].mas1 |= ((tsize << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK);
 	tlb1[tlb1_idx].mas2 = (va & MAS2_EPN_MASK) | flags;
 
 	/* Set supervisor RWX permission bits */
 	tlb1[tlb1_idx].mas3 = (pa & MAS3_RPN) | MAS3_SR | MAS3_SW | MAS3_SX;
 
 	/* Write the entry to hardware and claim the slot. */
 	tlb1_write_entry(tlb1_idx++);
 
 	/* XXX UNLOCK tlb1[] */
 
 	/*
 	 * XXX in general TLB1 updates should be propagated between CPUs,
 	 * since current design assumes to have the same TLB1 set-up on all
 	 * cores.
 	 */
 	return (0);
 }
 
 /*
  * Map in contiguous RAM region into the TLB1 using maximum of
  * KERNEL_REGION_MAX_TLB_ENTRIES entries.
  *
  * If necessary round up last entry size and return total size
  * used by all allocated entries.
  *
  * The requested size is first rounded up to a multiple of 1MB.  A list of
  * page sizes (powers of 4, at most 64MB each) is then chosen to cover it,
  * va and pa are aligned up to the first page size, and one TLB1 entry per
  * list element is created.  The returned size is measured from the original
  * va to the end of the last entry.
  */
 vm_size_t
 tlb1_mapin_region(vm_offset_t va, vm_paddr_t pa, vm_size_t size)
 {
 	vm_size_t pgs[KERNEL_REGION_MAX_TLB_ENTRIES];
 	vm_size_t mapped, pgsz, base, mask;
 	int idx, nents;
 
 	/* Round up to the next 1M */
 	size = (size + (1 << 20) - 1) & ~((1 << 20) - 1);
 
 	mapped = 0;
 	idx = 0;
 	base = va;
 	pgsz = 64*1024*1024;
 	while (mapped < size) {
 		/*
 		 * Greedily pick the largest page size that still fits into
 		 * the remainder, until the region is covered or the entry
 		 * budget is used up.
 		 */
 		while (mapped < size && idx < KERNEL_REGION_MAX_TLB_ENTRIES) {
 			while (pgsz > (size - mapped))
 				pgsz >>= 2;
 			pgs[idx++] = pgsz;
 			mapped += pgsz;
 		}
 
 		/* We under-map. Correct for this. */
 		if (mapped < size) {
 			/*
 			 * Drop the trailing entries of the smallest size and
 			 * replace them with a single entry four times larger.
 			 */
 			while (pgs[idx - 1] == pgsz) {
 				idx--;
 				mapped -= pgsz;
 			}
 			/* XXX We may increase beyond our starting point. */
 			pgsz <<= 2;
 			pgs[idx++] = pgsz;
 			mapped += pgsz;
 		}
 	}
 
 	nents = idx;
 	mask = pgs[0] - 1;
 	/* Align address to the boundary */
 	if (va & mask) {
 		va = (va + mask) & ~mask;
 		pa = (pa + mask) & ~mask;
 	}
 
 	for (idx = 0; idx < nents; idx++) {
 		pgsz = pgs[idx];
 		debugf("%u: %x -> %x, size=%x\n", idx, pa, va, pgsz);
 		tlb1_set_entry(va, pa, pgsz, _TLB_ENTRY_MEM);
 		pa += pgsz;
 		va += pgsz;
 	}
 
 	/* Measured from the original va, so any alignment gap is included. */
 	mapped = (va - base);
 	debugf("mapped size 0x%08x (wasted space 0x%08x)\n",
 	    mapped, mapped - size);
 	return (mapped);
 }
 
 /*
  * TLB1 initialization routine, to be called after the very first
  * assembler level setup done in locore.S.
  *
  * ccsrbar is the physical address of the CCSR window.  The routine records
  * the TLB1 entries that already map the kernel in tlb1[], derives kernload
  * and kernsize from them, adds a mapping for CCSRBAR, writes the remaining
  * tlb1[] slots to the hardware and installs the MAS4 defaults.
  */
 void
 tlb1_init(vm_offset_t ccsrbar)
 {
 	uint32_t mas0, mas1, mas3;
 	uint32_t tsz;
 	u_int i;
 
 	ccsrbar_pa = ccsrbar;
 
 	/*
 	 * The number of entries already in use is read from bootinfo (at
 	 * byte offset 8) when bootinfo is present and its first element is
 	 * not 1; otherwise a single entry is assumed.
 	 */
 	if (bootinfo != NULL && bootinfo[0] != 1) {
 		tlb1_idx = *((uint16_t *)(bootinfo + 8));
 	} else
 		tlb1_idx = 1;
 
 	/* The first entry/entries are used to map the kernel. */
 	for (i = 0; i < tlb1_idx; i++) {
 		mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(i);
 		mtspr(SPR_MAS0, mas0);
 		__asm __volatile("isync; tlbre");
 
 		mas1 = mfspr(SPR_MAS1);
 		if ((mas1 & MAS1_VALID) == 0)
 			continue;
 
 		mas3 = mfspr(SPR_MAS3);
 
 		/* Mirror the hardware entry in the in-RAM table. */
 		tlb1[i].mas1 = mas1;
 		tlb1[i].mas2 = mfspr(SPR_MAS2);
 		tlb1[i].mas3 = mas3;
 
 		/* The physical load address comes from the first entry. */
 		if (i == 0)
 			kernload = mas3 & MAS3_RPN;
 
 		tsz = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
 		kernsize += (tsz > 0) ? tsize2size(tsz) : 0;
 	}
 
 	/* Map in CCSRBAR. */
 	tlb1_set_entry(CCSRBAR_VA, ccsrbar, CCSRBAR_SIZE, _TLB_ENTRY_IO);
 
 #ifdef SMP
 	bp_ntlb1s = tlb1_idx;
 #endif
 
 	/*
 	 * Purge the remaining entries.
 	 * NOTE(review): this relies on the unused tlb1[] elements being
 	 * zero at this point -- confirm.
 	 */
 	for (i = tlb1_idx; i < TLB1_ENTRIES; i++)
 		tlb1_write_entry(i);
 
 	/* Setup TLB miss defaults */
 	set_mas4_defaults();
 }
 
 /*
  * Setup MAS4 defaults.
  * These values are loaded to MAS0-2 on a TLB miss.
  *
  * The defaults select TLB0 and a 4K page size; on SMP kernels MAS4_MD is
  * set in addition.
  */
 static void
 set_mas4_defaults(void)
 {
 	uint32_t mas4;
 
 	/* Defaults: TLB0, PID0, TSIZED=4K */
 	mas4 = MAS4_TLBSELD0;
 	mas4 |= (TLB_SIZE_4K << MAS4_TSIZED_SHIFT) & MAS4_TSIZED_MASK;
 #ifdef SMP
 	mas4 |= MAS4_MD;
 #endif
 	mtspr(SPR_MAS4, mas4);
 	__asm __volatile("isync");
 }
 
 /*
  * Print out contents of the MAS registers for each TLB1 entry
  *
  * All TLB1_ENTRIES hardware slots are read back with tlbre, whether or not
  * they are in use; see tlb1_print_entries() for the in-RAM copy.
  */
 void
 tlb1_print_tlbentries(void)
 {
 	uint32_t mas0, mas1, mas2, mas3, mas7;
 	int i;
 
 	debugf("TLB1 entries:\n");
 	for (i = 0; i < TLB1_ENTRIES; i++) {
 
 		/* Select TLB1 entry i and read it into the MAS registers. */
 		mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(i);
 		mtspr(SPR_MAS0, mas0);
 
 		__asm __volatile("isync; tlbre");
 
 		mas1 = mfspr(SPR_MAS1);
 		mas2 = mfspr(SPR_MAS2);
 		mas3 = mfspr(SPR_MAS3);
 		mas7 = mfspr(SPR_MAS7);
 
 		tlb_print_entry(i, mas1, mas2, mas3, mas7);
 	}
 }
 
 /*
  * Print out contents of the in-ram tlb1 table.
  *
  * The table holds no MAS7 value, so 0 is printed in its place.
  */
 void
 tlb1_print_entries(void)
 {
 	int i;
 
 	debugf("tlb1[] table entries:\n");
 	for (i = 0; i < TLB1_ENTRIES; i++)
 		tlb_print_entry(i, tlb1[i].mas1, tlb1[i].mas2, tlb1[i].mas3, 0);
 }
 
 /*
  * Return 0 if the physical IO range is encompassed by the given
  * TLB1 entry, otherwise return related error code.
  *
  * i is the index into tlb1[], pa and size describe the physical range, and
  * *va receives the virtual address that corresponds to pa on success (it is
  * set to NULL in every failure case).
  *
  * Errors:
  *  EINVAL - the entry is not valid;
  *  EPERM  - the entry is not cache-inhibited and guarded, or it lacks
  *           supervisor read/write permission;
  *  ERANGE - the range is not fully contained in the entry.
  */
 static int
 tlb1_iomapped(int i, vm_paddr_t pa, vm_size_t size, vm_offset_t *va)
 {
 	uint32_t prot;
 	vm_paddr_t pa_start;
 	vm_paddr_t pa_end;
 	vm_paddr_t pa_last;
 	unsigned int entry_tsize;
 	vm_size_t entry_size;
 
 	*va = (vm_offset_t)NULL;
 
 	/* Skip invalid entries */
 	if (!(tlb1[i].mas1 & MAS1_VALID))
 		return (EINVAL);
 
 	/*
 	 * The entry must be cache-inhibited, guarded, and r/w
 	 * so it can function as an i/o page
 	 */
 	prot = tlb1[i].mas2 & (MAS2_I | MAS2_G);
 	if (prot != (MAS2_I | MAS2_G))
 		return (EPERM);
 
 	prot = tlb1[i].mas3 & (MAS3_SR | MAS3_SW);
 	if (prot != (MAS3_SR | MAS3_SW))
 		return (EPERM);
 
 	/* The address should be within the entry range. */
 	entry_tsize = (tlb1[i].mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
 	KASSERT((entry_tsize), ("tlb1_iomapped: invalid entry tsize"));
 
 	entry_size = tsize2size(entry_tsize);
 	pa_start = tlb1[i].mas3 & MAS3_RPN;
 	pa_end = pa_start + entry_size - 1;
 
 	/*
 	 * pa_end is the last byte of the entry, so compare it against the
 	 * last byte of the requested range rather than against the address
 	 * one past it.  Otherwise a range that ends exactly at the end of
 	 * the entry would be rejected, and (pa + size) could wrap to 0 for
 	 * ranges reaching the top of the address space.  A wrapped range
 	 * (pa_last < pa) is never contained.
 	 */
 	pa_last = (size != 0) ? pa + size - 1 : pa;
 	if ((pa < pa_start) || (pa_last < pa) || (pa_last > pa_end))
 		return (ERANGE);
 
 	/* Return virtual address of this mapping. */
 	*va = (tlb1[i].mas2 & MAS2_EPN_MASK) + (pa - pa_start);
 	return (0);
 }
Index: user/attilio/vmc-playground/sys/vm/swap_pager.c
===================================================================
--- user/attilio/vmc-playground/sys/vm/swap_pager.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/swap_pager.c	(revision 247224)
@@ -1,2726 +1,2725 @@
 /*-
  * Copyright (c) 1998 Matthew Dillon,
  * Copyright (c) 1994 John S. Dyson
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *				New Swap System
  *				Matthew Dillon
  *
  * Radix Bitmap 'blists'.
  *
  *	- The new swapper uses the new radix bitmap code.  This should scale
  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
  *	  arbitrary degree of fragmentation.
  *
  * Features:
  *
  *	- on the fly reallocation of swap during putpages.  The new system
  *	  does not try to keep previously allocated swap blocks for dirty
  *	  pages.
  *
  *	- on the fly deallocation of swap
  *
  *	- No more garbage collection required.  Unnecessarily allocated swap
  *	  blocks only exist for dirty vm_page_t's now and these are already
  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
  *	  removal of invalidated swap blocks when a page is destroyed
  *	  or renamed.
  *
  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
  *
  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
  *	@(#)vm_swap.c	8.5 (Berkeley) 2/17/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_swap.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/blist.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 /*
  * SWB_NPAGES must be a power of 2.  It may be set to 1, 2, 4, 8, 16
  * or 32 pages per allocation.
  * The 32-page limit is due to the radix code (kern/subr_blist.c).
  */
 /* Default pageout cluster size (in pages) unless the build overrides it. */
 #ifndef MAX_PAGEOUT_CLUSTER
 #define MAX_PAGEOUT_CLUSTER 16
 #endif
 
 /* SWB_NPAGES defaults to the pageout cluster size. */
 #if !defined(SWB_NPAGES)
 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
 #endif
 
 /*
  * The swblock structure maps an object and a small, fixed-size range
  * of page indices to disk addresses within a swap area.
  * The collection of these mappings is implemented as a hash table.
  * Unused disk addresses within a swap area are allocated and managed
  * using a blist.
  */
 /* Scale n by sizeof(void *) / sizeof(daddr_t). */
 #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
 /* Number of swb_pages[] slots in one struct swblock. */
 #define SWAP_META_PAGES		(SWB_NPAGES * 2)
 /* Low-order page index bits that select a slot within one swblock. */
 #define SWAP_META_MASK		(SWAP_META_PAGES - 1)
 
 /*
  * One hash-table entry of swap metadata: the disk addresses for a run of
  * SWAP_META_PAGES consecutive page indices of a single object.
  */
 struct swblock {
 	struct swblock	*swb_hnext;	/* next entry on the same hash chain */
 	vm_object_t	swb_object;	/* object this entry belongs to */
 	vm_pindex_t	swb_index;	/* first page index, SWAP_META_PAGES aligned */
 	int		swb_count;	/* presumably the number of slots in use -- TODO confirm */
 	daddr_t		swb_pages[SWAP_META_PAGES];	/* per-page swap block addresses */
 };
 
 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
 /* Taken around the swap device list and the swap_reserved counter below. */
 static struct mtx sw_dev_mtx;
 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
 static struct swdevt *swdevhd;	/* Allocate from here next */
 static int nswapdev;		/* Number of swap devices */
 int swap_pager_avail;
 static int swdev_syscall_active = 0; /* serialize swap(on|off) */
 
 static vm_ooffset_t swap_total;
 SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
     "Total amount of available swap storage.");
 static vm_ooffset_t swap_reserved;
 SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
     "Amount of swap storage needed to back all allocated anonymous memory.");
 static int overcommit = 0;
 SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
 
 /*
  * bits from overcommit, as interpreted by swap_reserve_by_cred():
  *   FORCE_ON       - enforce the global limit (swap_total, plus non-wired
  *                    memory when ALLOW_NONWIRED is set) on reservations.
  *   RLIMIT_ON      - enforce RLIMIT_SWAP against the per-uid ui_vmsize.
  *   ALLOW_NONWIRED - count non-wired, non-reserved physical pages towards
  *                    the global limit.
  */
 #define	SWAP_RESERVE_FORCE_ON		(1 << 0)
 #define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
 #define	SWAP_RESERVE_ALLOW_NONWIRED	(1 << 2)
 
 int
 swap_reserve(vm_ooffset_t incr)
 {
 	struct ucred *cred;
 
 	/* Charge the reservation to the calling thread's credential. */
 	cred = curthread->td_ucred;
 	return (swap_reserve_by_cred(incr, cred));
 }
 
 /*
  * Try to reserve 'incr' bytes of swap on behalf of 'cred'.
  *
  * 'incr' must be a multiple of the page size (panics otherwise).
  * Returns 1 if the reservation was recorded in swap_reserved and in the
  * credential's real-uid ui_vmsize, 0 if it was refused.  A refusal is
  * logged, rate-limited through ppsratecheck().
  *
  * NOTE(review): the RACCT accounting and the RLIMIT_SWAP lookup use curproc
  * rather than anything derived from 'cred' -- confirm that callers only
  * pass credentials belonging to the current process.
  */
 int
 swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
 {
 	vm_ooffset_t r, s;
 	int res, error;
 	static int curfail;
 	static struct timeval lastfail;
 	struct uidinfo *uip;
 
 	uip = cred->cr_ruidinfo;
 
 	if (incr & PAGE_MASK)
 		panic("swap_reserve: & PAGE_MASK");
 
 #ifdef RACCT
 	/* Resource accounting gets the first veto. */
 	PROC_LOCK(curproc);
 	error = racct_add(curproc, RACCT_SWAP, incr);
 	PROC_UNLOCK(curproc);
 	if (error != 0)
 		return (0);
 #endif
 
 	/*
 	 * Global check: r is the would-be reservation total, s the limit
 	 * (swap_total, plus non-wired memory if so configured).
 	 */
 	res = 0;
 	mtx_lock(&sw_dev_mtx);
 	r = swap_reserved + incr;
 	if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
 		s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count;
 		s *= PAGE_SIZE;
 	} else
 		s = 0;
 	s += swap_total;
 	/*
 	 * Accept if enforcement is off, if the total fits, or if the
 	 * thread holds PRIV_VM_SWAP_NOQUOTA.
 	 */
 	if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
 	    (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) {
 		res = 1;
 		swap_reserved = r;
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	if (res) {
 		/* Per-uid check against RLIMIT_SWAP, when enabled. */
 		PROC_LOCK(curproc);
 		UIDINFO_VMSIZE_LOCK(uip);
 		if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
 		    uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) &&
 		    priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
 			res = 0;
 		else
 			uip->ui_vmsize += incr;
 		UIDINFO_VMSIZE_UNLOCK(uip);
 		PROC_UNLOCK(curproc);
 		if (!res) {
 			/* Roll back the global reservation made above. */
 			mtx_lock(&sw_dev_mtx);
 			swap_reserved -= incr;
 			mtx_unlock(&sw_dev_mtx);
 		}
 	}
 	if (!res && ppsratecheck(&lastfail, &curfail, 1)) {
 		printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
 		    uip->ui_uid, curproc->p_pid, incr);
 	}
 
 #ifdef RACCT
 	/* Undo the racct charge taken at the top when the request failed. */
 	if (!res) {
 		PROC_LOCK(curproc);
 		racct_sub(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 	}
 #endif
 
 	return (res);
 }
 
 /*
  * Unconditionally account 'incr' bytes of swap: bump the global
  * swap_reserved total, the racct counter (when compiled in) and the
  * ui_vmsize of the current thread's real uid.  No limit is checked.
  */
 void
 swap_reserve_force(vm_ooffset_t incr)
 {
 	struct uidinfo *uidinfo;
 
 	uidinfo = curthread->td_ucred->cr_ruidinfo;
 
 	/* Global total. */
 	mtx_lock(&sw_dev_mtx);
 	swap_reserved += incr;
 	mtx_unlock(&sw_dev_mtx);
 
 #ifdef RACCT
 	/* Resource accounting, forced. */
 	PROC_LOCK(curproc);
 	racct_add_force(curproc, RACCT_SWAP, incr);
 	PROC_UNLOCK(curproc);
 #endif
 
 	/* Per-uid total. */
 	PROC_LOCK(curproc);
 	UIDINFO_VMSIZE_LOCK(uidinfo);
 	uidinfo->ui_vmsize += incr;
 	UIDINFO_VMSIZE_UNLOCK(uidinfo);
 	PROC_UNLOCK(curproc);
 }
 
 /*
  * Release 'decr' bytes of reserved swap against the current thread's
  * credential, holding the process lock across the call.
  */
 void
 swap_release(vm_ooffset_t decr)
 {
 
 	PROC_LOCK(curproc);
 	swap_release_by_cred(decr, curthread->td_ucred);
 	PROC_UNLOCK(curproc);
 }
 
 /*
  * Give back 'decr' bytes of reserved swap that were charged to 'cred'.
  * 'decr' must be page aligned and must not exceed the global reservation;
  * both conditions panic when violated.  A per-uid underflow is only
  * reported.
  */
 void
 swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
 {
 	struct uidinfo *uidinfo;
 
 	uidinfo = cred->cr_ruidinfo;
 
 	if ((decr & PAGE_MASK) != 0)
 		panic("swap_release: & PAGE_MASK");
 
 	/* Global total. */
 	mtx_lock(&sw_dev_mtx);
 	if (decr > swap_reserved)
 		panic("swap_reserved < decr");
 	swap_reserved -= decr;
 	mtx_unlock(&sw_dev_mtx);
 
 	/* Per-uid total. */
 	UIDINFO_VMSIZE_LOCK(uidinfo);
 	if (decr > uidinfo->ui_vmsize)
 		printf("negative vmsize for uid = %d\n", uidinfo->ui_uid);
 	uidinfo->ui_vmsize -= decr;
 	UIDINFO_VMSIZE_UNLOCK(uidinfo);
 
 	racct_sub_cred(cred, RACCT_SWAP, decr);
 }
 
 static void swapdev_strategy(struct buf *, struct swdevt *sw);
 
 /* Flags passed as the last argument of swp_pager_meta_ctl(). */
 #define SWM_FREE	0x02	/* free, period			*/
 #define SWM_POP		0x04	/* pop out			*/
 
 int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
 static int nsw_rcount;		/* free read buffers			*/
 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
 static int nsw_wcount_async;	/* limit write buffers / asynchronous	*/
 static int nsw_wcount_async_max;/* assigned maximum			*/
 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
 
 /*
  * Swap metadata hash table: swhash[] holds chains of struct swblock and
  * swhash_mask is (table size - 1).  Both are set up in
  * swap_pager_swap_init().
  */
 static struct swblock **swhash;
 static int swhash_mask;
 static struct mtx swhash_mtx;
 
 static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
 /* Held across named-object lookup and creation in swap_pager_alloc(). */
 static struct sx sw_alloc_sx;
 
 
 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
 	CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
 
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  * of searching a named list by hashing it just a little.
  */
 
 #define NOBJLISTS		8
 
 /* Pick the object list for a handle from bits 4 and up of its address. */
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
 
 static struct mtx sw_alloc_mtx;	/* protect list manipulation */
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
 static uma_zone_t	swap_zone;
-static struct vm_object	swap_zone_obj;
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  * calls hooked from other parts of the VM system and do not appear here.
  * (see vm/swap_pager.h).
  *
  * The prototypes below are the file-local implementations that are wired
  * into the swappagerops table (swap_pager_swapoff() is declared here too
  * but is not part of the table).
  */
 static vm_object_t
 		swap_pager_alloc(void *handle, vm_ooffset_t size,
 		    vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
 static void	swap_pager_dealloc(vm_object_t object);
 static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
 static void	swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t
 		swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
 static void	swap_pager_init(void);
 static void	swap_pager_unswapped(vm_page_t);
 static void	swap_pager_swapoff(struct swdevt *sp);
 
 struct pagerops swappagerops = {
 	.pgo_init =	swap_pager_init,	/* early system initialization of pager	*/
 	.pgo_alloc =	swap_pager_alloc,	/* allocate an OBJT_SWAP object		*/
 	.pgo_dealloc =	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
 	.pgo_getpages =	swap_pager_getpages,	/* pagein				*/
 	.pgo_putpages =	swap_pager_putpages,	/* pageout				*/
 	.pgo_haspage =	swap_pager_haspage,	/* get backing store status for page	*/
 	.pgo_pageunswapped = swap_pager_unswapped,	/* remove swap related to page		*/
 };
 
 /*
  * dmmax is in page-sized chunks with the new swap system.  It was
  * dev-bsized chunks in the old.  dmmax is always a power of 2.
  * It is set to SWB_NPAGES * 2 by swap_pager_init().
  *
  * swap_*() routines are externally accessible.  swp_*() routines are
  * internal.
  */
 static int dmmax;
 /*
  * Hysteresis thresholds used by swp_sizecheck(): below lowat the
  * "almost full" state is entered, above hiwat it is left again.
  */
 static int nswap_lowat = 128;	/* in pages, swap_pager_almost_full warn */
 static int nswap_hiwat = 512;	/* in pages, swap_pager_almost_full warn */
 
 SYSCTL_INT(_vm, OID_AUTO, dmmax,
 	CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
 
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
 static int	swapongeom(struct thread *, struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);
 
 /*
  * Swap bitmap functions
  */
 static void	swp_pager_freeswapspace(daddr_t blk, int npages);
 static daddr_t	swp_pager_getswapspace(int npages);
 
 /*
  * Metadata functions
  */
 static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free_all(vm_object_t);
 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
 
 /*
  * Free page 'm' unless it is wired.  The page lock is held around the
  * check and the free.
  */
 static void
 swp_pager_free_nrpage(vm_page_t m)
 {
 
 	vm_page_lock(m);
 	if (m->wire_count != 0) {
 		/* Wired pages are left alone. */
 		vm_page_unlock(m);
 		return;
 	}
 	vm_page_free(m);
 	vm_page_unlock(m);
 }
 
 /*
  * SWP_SIZECHECK() -	update swap_pager_full indication
  *
  *	update the swap_pager_almost_full indication and warn when we are
  *	about to run out of swap space, using lowat/hiwat hysteresis.
  *
  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
  *
  *	No restrictions on call
  *	This routine may not block.
  */
 static void
 swp_sizecheck(void)
 {
 
 	if (swap_pager_avail >= nswap_lowat) {
 		/* Enough space: stop task killing, maybe leave "almost full". */
 		swap_pager_full = 0;
 		if (swap_pager_avail > nswap_hiwat)
 			swap_pager_almost_full = 0;
 		return;
 	}
 
 	/* Below the low watermark: warn once on entering "almost full". */
 	if (swap_pager_almost_full == 0) {
 		printf("swap_pager: out of swap space\n");
 		swap_pager_almost_full = 1;
 	}
 }
 
 /*
  * SWP_PAGER_HASH() -	hash swap meta data
  *
  *	Helper that locates the swblock covering (object, index).  The
  *	index is first rounded down to a SWAP_META_PAGES boundary.
  *
  *	The return value is the address of the link that points at the
  *	matching swblock.  If no swblock matches, it is the address of the
  *	NULL link terminating the hash chain, which is where a new swblock
  *	can be hooked in.
  */
 static struct swblock **
 swp_pager_hash(vm_object_t object, vm_pindex_t index)
 {
 	struct swblock **link;
 	struct swblock *cur;
 
 	index &= ~(vm_pindex_t)SWAP_META_MASK;
 	link = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
 
 	/* Follow the chain until a match or the terminating NULL link. */
 	for (cur = *link; cur != NULL; cur = *link) {
 		if (cur->swb_object == object && cur->swb_index == index)
 			return (link);
 		link = &cur->swb_hnext;
 	}
 	return (link);
 }
 
 /*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
  *	Expected to be started from system init.  NOTE:  This code is run
  *	before much else so be careful what you depend on.  Most of the VM
  *	system has yet to be initialized at this point.
  */
 static void
 swap_pager_init(void)
 {
 	int list;
 
 	/* Start with every named-object list empty. */
 	list = 0;
 	while (list < NOBJLISTS) {
 		TAILQ_INIT(&swap_pager_object_list[list]);
 		list++;
 	}
 
 	/* Locks for the object lists and for the swap device state. */
 	mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
 
 	/* Device stripe size, counted in PAGE_SIZE blocks. */
 	dmmax = SWB_NPAGES * 2;
 }
 
 /*
  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  *
  *	Expected to be started from pageout process once, prior to entering
  *	its main loop.
  */
 /*
  * Second-stage initialization: sets the swap buffer limits, creates the
  * "SWAPMETA" UMA zone used for struct swblock, and allocates the metadata
  * hash table.
  */
 void
 swap_pager_swap_init(void)
 {
 	int n, n2;
 
 	/*
 	 * Number of in-transit swap bp operations.  Don't
 	 * exhaust the pbufs completely.  Make sure we
 	 * initialize workable values (0 will work for hysteresis
 	 * but it isn't very efficient).
 	 *
 	 * The nsw_cluster_max is constrained by the bp->b_pages[]
 	 * array (MAXPHYS/PAGE_SIZE) and our locally defined
 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
 	 * constrained by the swap device interleave stripe size.
 	 *
 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
 	 * designed to prevent other I/O from having high latencies due to
 	 * our pageout I/O.  The value 4 works well for one or two active swap
 	 * devices but is probably a little low if you have more.  Even so,
 	 * a higher value would probably generate only a limited improvement
 	 * with three or four active swap devices since the system does not
 	 * typically have to pageout at extreme bandwidths.   We will want
 	 * at least 2 per swap devices, and 4 is a pretty good value if you
 	 * have one NFS swap device due to the command/ack latency over NFS.
 	 * So it all works out pretty well.
 	 */
 	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
 
 	/* Split the pbuf pool: half for reads, a quarter for sync writes. */
 	mtx_lock(&pbuf_mtx);
 	nsw_rcount = (nswbuf + 1) / 2;
 	nsw_wcount_sync = (nswbuf + 3) / 4;
 	nsw_wcount_async = 4;
 	nsw_wcount_async_max = nsw_wcount_async;
 	mtx_unlock(&pbuf_mtx);
 
 	/*
 	 * Initialize our zone.  Right now I'm just guessing on the number
 	 * we need based on the number of pages in the system.  Each swblock
 	 * can hold 16 pages, so this is probably overkill.  This reservation
 	 * is typically limited to around 32MB by default.
 	 */
 	n = cnt.v_page_count / 2;
 	if (maxswzone && n > maxswzone / sizeof(struct swblock))
 		n = maxswzone / sizeof(struct swblock);
 	n2 = n;		/* remember the initial request for the message below */
 	swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
 	if (swap_zone == NULL)
 		panic("failed to create swap_zone.");
 	do {
-		if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
+		if (uma_zone_reserve_kva(swap_zone, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
 		 * size of the previous attempt.
 		 */
 		n -= ((n + 2) / 3);
 	} while (n > 0);
 	if (n2 != n)
 		printf("Swap zone entries reduced from %d to %d.\n", n2, n);
 	n2 = n;		/* n2 is now the number of zone entries obtained */
 
 	/*
 	 * Initialize our meta-data hash table.  The swapper does not need to
 	 * be quite as efficient as the VM system, so we do not use an
 	 * oversized hash table.
 	 *
 	 * 	n: 		size of hash table, must be power of 2
 	 *	swhash_mask:	hash table index mask
 	 */
 	/* Smallest power of two that is >= n2 / 8 (at least 1). */
 	for (n = 1; n < n2 / 8; n *= 2)
 		;
 	swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
 	swhash_mask = n - 1;
 	mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
 }
 
 /*
  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
  *			its metadata structures.
  *
  *	This routine is called from the mmap and fork code to create a new
  *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
  *	and then converting it with swp_pager_meta_build().
  *
  *	This routine may block in vm_object_allocate() and create a named
  *	object lookup race, so we must interlock.
  *
  * MPSAFE
  */
 /*
  * Parameters:
  *	handle	non-NULL selects a named object, which is looked up first
  *		and only created when absent; NULL always creates a new
  *		anonymous object.
  *	size, offset	used to compute the object size in pages.
  *	prot	not used by this function.
  *	cred	if non-NULL, 'size' bytes of swap are reserved against it
  *		for a newly created object and a reference is taken.
  *
  * Returns the object, or NULL if the swap reservation was refused.
  */
 static vm_object_t
 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
 	vm_pindex_t pindex;
 
 	/* Object size in pages, rounding a partial last page up. */
 	pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
 	if (handle) {
 		mtx_lock(&Giant);
 		/*
 		 * Reference existing named region or allocate new one.  There
 		 * should not be a race here against swp_pager_meta_build()
 		 * as called from vm_page_remove() in regards to the lookup
 		 * of the handle.
 		 */
 		sx_xlock(&sw_alloc_sx);
 		object = vm_pager_object_lookup(NOBJLIST(handle), handle);
 		if (object == NULL) {
 			if (cred != NULL) {
 				if (!swap_reserve_by_cred(size, cred)) {
 					/* Reservation refused: unwind both locks. */
 					sx_xunlock(&sw_alloc_sx);
 					mtx_unlock(&Giant);
 					return (NULL);
 				}
 				crhold(cred);
 			}
 			object = vm_object_allocate(OBJT_DEFAULT, pindex);
 			VM_OBJECT_LOCK(object);
 			object->handle = handle;
 			if (cred != NULL) {
 				object->cred = cred;
 				object->charge = size;
 			}
 			/*
 			 * Called with SWAPBLK_NONE; per the function header
 			 * this converts the OBJT_DEFAULT object to OBJT_SWAP.
 			 */
 			swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 			VM_OBJECT_UNLOCK(object);
 		}
 		sx_xunlock(&sw_alloc_sx);
 		mtx_unlock(&Giant);
 	} else {
 		/* Anonymous object: no lookup, hence no Giant / sx interlock. */
 		if (cred != NULL) {
 			if (!swap_reserve_by_cred(size, cred))
 				return (NULL);
 			crhold(cred);
 		}
 		object = vm_object_allocate(OBJT_DEFAULT, pindex);
 		VM_OBJECT_LOCK(object);
 		if (cred != NULL) {
 			object->cred = cred;
 			object->charge = size;
 		}
 		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 		VM_OBJECT_UNLOCK(object);
 	}
 	return (object);
 }
 
 /*
  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
  *
  *	The swap backing for the object is destroyed.  The code is
  *	designed such that we can reinstantiate it later, but this
  *	routine is typically called only when the entire object is
  *	about to be destroyed.
  *
  *	The object must be locked.
  */
 static void
 swap_pager_dealloc(vm_object_t object)
 {
 	void *handle;
 
 	/*
 	 * A named object is unhooked from its lookup list first, so that
 	 * lookups already fail should we block below waiting for pageout
 	 * completion.
 	 */
 	handle = object->handle;
 	if (handle != NULL) {
 		mtx_lock(&sw_alloc_mtx);
 		TAILQ_REMOVE(NOBJLIST(handle), object, pager_object_list);
 		mtx_unlock(&sw_alloc_mtx);
 	}
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
 	 * Drop whatever swap metadata is left.  Only the swap meta data is
 	 * released here; swapblks still tied to vm_page_t's of this object
 	 * are not chased, and paging still in progress on some objects is
 	 * of no concern.
 	 */
 	swp_pager_meta_free_all(object);
 }
 
 /************************************************************************
  *			SWAP PAGER BITMAP ROUTINES			*
  ************************************************************************/
 
 /*
  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
  *
  *	Allocate swap for the requested number of pages.  The starting
  *	swap block number (a page index) is returned or SWAPBLK_NONE
  *	if the allocation failed.
  *
  *	Also has the side effect of advising that somebody made a mistake
  *	when they configured swap and didn't configure enough.
  *
  *	This routine may not sleep.
  *
  *	We allocate in round-robin fashion from the configured devices.
  */
 static daddr_t
 swp_pager_getswapspace(int npages)
 {
 	daddr_t blk;
 	struct swdevt *sp;
 	int i;
 
 	blk = SWAPBLK_NONE;
 	mtx_lock(&sw_dev_mtx);
 	/* Resume at the round-robin cursor left by the previous allocation. */
 	sp = swdevhd;
 	for (i = 0; i < nswapdev; i++) {
 		/* A NULL cursor means wrap around to the head of the list. */
 		if (sp == NULL)
 			sp = TAILQ_FIRST(&swtailq);
 		/* Devices flagged SW_CLOSING are not allocated from. */
 		if (!(sp->sw_flags & SW_CLOSING)) {
 			blk = blist_alloc(sp->sw_blist, npages);
 			if (blk != SWAPBLK_NONE) {
 				/* Turn the per-device block into a global one. */
 				blk += sp->sw_first;
 				sp->sw_used += npages;
 				swap_pager_avail -= npages;
 				swp_sizecheck();
 				/* The next allocation starts on the following device. */
 				swdevhd = TAILQ_NEXT(sp, sw_list);
 				goto done;
 			}
 		}
 		sp = TAILQ_NEXT(sp, sw_list);
 	}
 	/* Every device was tried without success; complain only once. */
 	if (swap_pager_full != 2) {
 		printf("swap_pager_getswapspace(%d): failed\n", npages);
 		swap_pager_full = 2;
 		swap_pager_almost_full = 1;
 	}
 	swdevhd = NULL;
 done:
 	mtx_unlock(&sw_dev_mtx);
 	return (blk);
 }
 
 /*
  * Return non-zero when swap block 'blk' lies within device 'sp', i.e. in
  * the half-open range [sw_first, sw_end).
  */
 static int
 swp_pager_isondev(daddr_t blk, struct swdevt *sp)
 {
 
 	if (blk < sp->sw_first)
 		return (0);
 	return (blk < sp->sw_end);
 }
 
 /*
  * Hand buffer 'bp' to the strategy routine of the swap device whose block
  * range contains bp->b_blkno.  Panics if no device covers that block.
  */
 static void
 swp_pager_strategy(struct buf *bp)
 {
 	struct swdevt *dev;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(bp->b_blkno, dev))
 			continue;
 		/* The device list lock is dropped before starting the I/O. */
 		mtx_unlock(&sw_dev_mtx);
 		dev->sw_strategy(bp, dev);
 		return;
 	}
 	panic("Swapdev not found");
 }
 
 
 /*
  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
  *
  *	This routine returns the specified swap blocks back to the bitmap.
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_freeswapspace(daddr_t blk, int npages)
 {
 	struct swdevt *dev;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(blk, dev))
 			continue;
 
 		dev->sw_used -= npages;
 		/*
 		 * While swapping is being stopped on this device its
 		 * blocks must not be marked free, lest they get reused.
 		 */
 		if ((dev->sw_flags & SW_CLOSING) == 0) {
 			blist_free(dev->sw_blist, blk - dev->sw_first, npages);
 			swap_pager_avail += npages;
 			swp_sizecheck();
 		}
 		mtx_unlock(&sw_dev_mtx);
 		return;
 	}
 	panic("Swapdev not found");
 }
 
 /*
  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
  *				range within an object.
  *
  *	This is a globally accessible routine.
  *
  *	This routine removes swapblk assignments from swap metadata.
  *
  *	The external callers of this routine typically have already destroyed
  *	or renamed vm_page_t's associated with this range in the object so
  *	we should be ok.
  */
 void
 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 
 	/* The caller must already hold the object lock. */
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/* Drop the swapblk assignments for 'size' pages from 'start'. */
 	swp_pager_meta_free(object, start, size);
 }
 
 /*
  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  *
  *	Assigns swap blocks to the 'size' pages starting at page index
  *	'start' within the object.  The swap blocks are not zeroed.  Any
  *	previous swap assignment in that range is destroyed.
  *
  *	Swap space is obtained in chunks of up to BLIST_MAX_ALLOC blocks,
  *	halving the chunk size whenever an allocation fails.
  *
  *	Returns 0 on success, -1 on failure.  On failure the assignments
  *	made so far by this call are released again.
  */
 int
 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 	int n = 0;			/* blocks left in the current chunk */
 	daddr_t blk = SWAPBLK_NONE;	/* next unassigned block of the chunk */
 	vm_pindex_t beg = start;	/* save start index */
 
 	VM_OBJECT_LOCK(object);
 	while (size) {
 		if (n == 0) {
 			n = BLIST_MAX_ALLOC;
 			while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
 				n >>= 1;
 				if (n == 0) {
 					swp_pager_meta_free(object, beg, start - beg);
 					VM_OBJECT_UNLOCK(object);
 					return (-1);
 				}
 			}
 		}
 		swp_pager_meta_build(object, start, blk);
 		--size;
 		++start;
 		++blk;
 		--n;
 	}
 	/*
 	 * The last chunk may be larger than what was still needed.  Its
 	 * unused tail (blk .. blk + n - 1) was never entered into the
 	 * object's metadata, so it has to be handed back to the swap
 	 * bitmap directly.  Freeing metadata for page indices past the
 	 * requested range instead would leak these blocks and would drop
 	 * swap assignments the caller never asked to touch.
 	 */
 	if (n > 0)
 		swp_pager_freeswapspace(blk, n);
 	VM_OBJECT_UNLOCK(object);
 	return (0);
 }
 
 /*
  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  *			and destroy the source.
  *
  *	Copy any valid swapblks from the source to the destination.  In
  *	cases where both the source and destination have a valid swapblk,
  *	we keep the destination's.
  *
  *	This routine is allowed to sleep.  It may sleep allocating metadata
  *	indirectly through swp_pager_meta_build() or if paging is still in
  *	progress on the source.
  *
  *	The source object contains no vm_page_t's (which is just as well)
  *
  *	The source object is of type OBJT_SWAP.
  *
  *	The source and destination objects must be locked.
  *	Both object locks may temporarily be released.
  */
 /*
  * Parameters:
  *	srcobject, dstobject	both must be locked by the caller
  *	offset			page index in the source corresponding to
  *				page index 0 of the destination
  *	destroysource		non-zero to also unhook the source from the
  *				named-object list, free its remaining swap
  *				metadata and revert it to OBJT_DEFAULT
  */
 void
 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t offset, int destroysource)
 {
 	vm_pindex_t i;
 
 	VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
 
 	/*
 	 * If destroysource is set, we remove the source object from the
 	 * swap_pager internal queue now.
 	 */
 	if (destroysource) {
 		if (srcobject->handle != NULL) {
 			mtx_lock(&sw_alloc_mtx);
 			TAILQ_REMOVE(
 			    NOBJLIST(srcobject->handle),
 			    srcobject,
 			    pager_object_list
 			);
 			mtx_unlock(&sw_alloc_mtx);
 		}
 	}
 
 	/*
 	 * transfer source to destination.
 	 */
 	for (i = 0; i < dstobject->size; ++i) {
 		daddr_t dstaddr;
 
 		/*
 		 * Locate (without changing) the swapblk on the destination,
 		 * unless it is invalid in which case free it silently, or
 		 * if the destination is a resident page, in which case the
 		 * source is thrown away.
 		 */
 		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
 
 		if (dstaddr == SWAPBLK_NONE) {
 			/*
 			 * Destination has no swapblk and is not resident,
 			 * copy source.
 			 */
 			daddr_t srcaddr;
 
 			/* SWM_POP: take the block out of the source metadata. */
 			srcaddr = swp_pager_meta_ctl(
 			    srcobject,
 			    i + offset,
 			    SWM_POP
 			);
 
 			if (srcaddr != SWAPBLK_NONE) {
 				/*
 				 * swp_pager_meta_build() can sleep.
 				 *
 				 * The source lock is dropped around the
 				 * call; paging-in-progress references are
 				 * held on both objects meanwhile.
 				 */
 				vm_object_pip_add(srcobject, 1);
 				VM_OBJECT_UNLOCK(srcobject);
 				vm_object_pip_add(dstobject, 1);
 				swp_pager_meta_build(dstobject, i, srcaddr);
 				vm_object_pip_wakeup(dstobject);
 				VM_OBJECT_LOCK(srcobject);
 				vm_object_pip_wakeup(srcobject);
 			}
 		} else {
 			/*
 			 * Destination has valid swapblk or it is represented
 			 * by a resident page.  We destroy the sourceblock.
 			 */
 
 			swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
 		}
 	}
 
 	/*
 	 * Free left over swap blocks in source.
 	 *
 	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
 	 * double-remove the object from the swap queues.
 	 */
 	if (destroysource) {
 		swp_pager_meta_free_all(srcobject);
 		/*
 		 * Reverting the type is not necessary, the caller is going
 		 * to destroy srcobject directly, but I'm doing it here
 		 * for consistency since we've removed the object from its
 		 * queues.
 		 */
 		srcobject->type = OBJT_DEFAULT;
 	}
 }
 
 /*
  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
  *				the requested page.
  *
  *	We determine whether good backing store exists for the requested
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
  *	store exists before and after the requested page within a reasonable
  *	distance.  We do not try to restrict it to the swap device stripe
  *	(that is handled in getpages/putpages).  It probably isn't worth
  *	doing here.
  */
 static boolean_t
 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
 {
 	daddr_t base, probe;
 	int run;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/* Without a swap block at pindex itself there is nothing to report. */
 	base = swp_pager_meta_ctl(object, pindex, 0);
 	if (base == SWAPBLK_NONE) {
 		if (before)
 			*before = 0;
 		if (after)
 			*after = 0;
 		return (FALSE);
 	}
 
 	/*
 	 * Count preceding pages whose swap blocks directly precede ours,
 	 * without stepping below page index 0.
 	 */
 	if (before != NULL) {
 		run = 1;
 		while (run < (SWB_NPAGES/2) && run <= pindex) {
 			probe = swp_pager_meta_ctl(object, pindex - run, 0);
 			if (probe != base - run)
 				break;
 			++run;
 		}
 		*before = run - 1;
 	}
 
 	/* Likewise for the pages whose swap blocks directly follow ours. */
 	if (after != NULL) {
 		run = 1;
 		while (run < (SWB_NPAGES/2)) {
 			probe = swp_pager_meta_ctl(object, pindex + run, 0);
 			if (probe != base + run)
 				break;
 			++run;
 		}
 		*after = run - 1;
 	}
 	return (TRUE);
 }
 
 /*
  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  *
  *	This removes any associated swap backing store, whether valid or
  *	not, from the page.
  *
  *	This routine is typically called when a page is made dirty, at
  *	which point any associated swap can be freed.  MADV_FREE also
  *	calls us in a special-case situation.
  *
  *	NOTE!!!  If the page is clean and the swap was valid, the caller
  *	should make the page dirty before calling this routine.  This routine
  *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
  *	depends on it.
  *
  *	This routine may not sleep.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	vm_object_t obj;
 
 	obj = m->object;
 
 	/* The page's object must be locked by the caller. */
 	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
 
 	/* Release the swap block recorded for this page, if any. */
 	swp_pager_meta_ctl(obj, m->pindex, SWM_FREE);
 }
 
 /*
  * SWAP_PAGER_GETPAGES() - bring pages in from swap
  *
  *	Attempt to retrieve (m, count) pages from backing store, but make
  *	sure we retrieve at least m[reqpage].  We try to load in as large
  *	a chunk surrounding m[reqpage] as is contiguous in swap and which
  *	belongs to the same object.
  *
  *	The code is designed for asynchronous operation and
  *	immediate-notification of 'reqpage' but tends not to be
  *	used that way.  Please do not optimize-out this algorithmic
  *	feature, I intend to improve on it in the future.
  *
  *	The parent has a single vm_object_pip_add() reference prior to
  *	calling us and we should return with the same.
  *
  *	The parent has BUSY'd the pages.  We should return with 'm'
  *	left busy, but the others adjusted.
  */
 static int
 swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
 {
 	struct buf *bp;
 	vm_page_t mreq;
 	int i;
 	int j;
 	daddr_t blk;
 
 	/* The page the caller actually needs; everything else is read-ahead. */
 	mreq = m[reqpage];
 
 	KASSERT(mreq->object == object,
 	    ("swap_pager_getpages: object mismatch %p/%p",
 	    object, mreq->object));
 
 	/*
 	 * Calculate range to retrieve.  The pages have already been assigned
 	 * their swapblks.  We require a *contiguous* range but we know it to
 	 * not span devices.   If we do not supply it, bad things
 	 * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
 	 * loops are set up such that the case(s) are handled implicitly.
 	 *
 	 * The swp_*() calls must be made with the object locked.
 	 */
 	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
 
 	/*
 	 * Walk backwards from reqpage while each page's swap block sits
 	 * exactly (reqpage - i) blocks before blk.  The loop leaves i on the
 	 * first page that does NOT fit (or -1), hence the ++i below.
 	 */
 	for (i = reqpage - 1; i >= 0; --i) {
 		daddr_t iblk;
 
 		iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
 		if (blk != iblk + (reqpage - i))
 			break;
 	}
 	++i;
 
 	/*
 	 * Same walk forwards.  j ends up one past the last page that fits,
 	 * so the run to read is m[i] .. m[j - 1].
 	 */
 	for (j = reqpage + 1; j < count; ++j) {
 		daddr_t jblk;
 
 		jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
 		if (blk != jblk - (j - reqpage))
 			break;
 	}
 
 	/*
 	 * free pages outside our collection range.   Note: we never free
 	 * mreq, it must remain busy throughout.
 	 */
 	if (0 < i || j < count) {
 		int k;
 
 		for (k = 0; k < i; ++k)
 			swp_pager_free_nrpage(m[k]);
 		for (k = j; k < count; ++k)
 			swp_pager_free_nrpage(m[k]);
 	}
 
 	/*
 	 * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq
 	 * still busy, but the others unbusied.
 	 */
 	if (blk == SWAPBLK_NONE)
 		return (VM_PAGER_FAIL);
 
 	/*
 	 * Getpbuf() can sleep.
 	 */
 	VM_OBJECT_UNLOCK(object);
 	/*
 	 * Get a swap buffer header to perform the IO
 	 */
 	bp = getpbuf(&nsw_rcount);
 	bp->b_flags |= B_PAGING;
 
 	/*
 	 * map our page(s) into kva for input
 	 */
 	pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
 
 	/*
 	 * Describe the read: it starts at the swap block of m[i] and covers
 	 * (j - i) pages.  pg_reqpage is the index of mreq relative to the
 	 * start of the run, which the completion routine compares against.
 	 */
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
 	bp->b_wcred = crhold(thread0.td_ucred);
 	bp->b_blkno = blk - (reqpage - i);
 	bp->b_bcount = PAGE_SIZE * (j - i);
 	bp->b_bufsize = PAGE_SIZE * (j - i);
 	bp->b_pager.pg_reqpage = reqpage - i;
 
 	/* Retake the object lock to flag every page in the run as in-flight. */
 	VM_OBJECT_LOCK(object);
 	{
 		int k;
 
 		for (k = i; k < j; ++k) {
 			bp->b_pages[k - i] = m[k];
 			m[k]->oflags |= VPO_SWAPINPROG;
 		}
 	}
 	bp->b_npages = j - i;
 
 	PCPU_INC(cnt.v_swapin);
 	PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
 
 	/*
 	 * We still hold the lock on mreq, and our automatic completion routine
 	 * does not remove it.
 	 */
 	vm_object_pip_add(object, bp->b_npages);
 	VM_OBJECT_UNLOCK(object);
 
 	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 	 * this point because we automatically release it on completion.
 	 * Instead, we look at the one page we are interested in which we
 	 * still hold a lock on even through the I/O completion.
 	 *
 	 * The other pages in our m[] array are also released on completion,
 	 * so we cannot assume they are valid anymore either.
 	 *
 	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 	 */
 	BUF_KERNPROC(bp);
 	swp_pager_strategy(bp);
 
 	/*
 	 * wait for the page we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 	 * is set in the meta-data.
 	 *
 	 * Each sleep is bounded by hz*20 ticks; a non-zero msleep() return
 	 * only produces the diagnostic below and the loop keeps waiting.
 	 *
 	 * NOTE(review): the diagnostic dereferences bp even though the
 	 * comment above says bp cannot be considered valid once the I/O has
 	 * been issued -- confirm whether this can read a recycled buffer.
 	 */
 	VM_OBJECT_LOCK(object);
 	while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
 		mreq->oflags |= VPO_WANTED;
 		PCPU_INC(cnt.v_intrans);
 		if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
 			printf(
 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
 		}
 	}
 
 	/*
 	 * mreq is left busied after completion, but all the other pages
 	 * are freed.  If we had an unrecoverable read error the page will
 	 * not be valid.
 	 */
 	if (mreq->valid != VM_PAGE_BITS_ALL) {
 		return (VM_PAGER_ERROR);
 	} else {
 		return (VM_PAGER_OK);
 	}
 
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
 	 * the page clean when we return, causing the page to possibly revert
 	 * to all-zero's later.
 	 */
 }
 
 /*
  *	swap_pager_putpages:
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
  *
  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
  *	are automatically converted to SWAP objects.
  *
  *	In a low memory situation we may block in VOP_STRATEGY(), but the new
  *	vm_page reservation system coupled with properly written VFS devices
  *	should ensure that no low-memory deadlock occurs.  This is an area
  *	which needs work.
  *
  *	The parent has N vm_object_pip_add() references prior to
  *	calling us and will remove references for rtvals[] that are
  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
  *	completion.
  *
  *	The parent has soft-busy'd the pages it passes us and will unbusy
  *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
  *	We need to unbusy the rest on I/O completion.
  */
 void
 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
     boolean_t sync, int *rtvals)
 {
 	int i;
 	int n = 0;
 
 	/* Only the first page is checked against the object argument. */
 	if (count && m[0]->object != object) {
 		panic("swap_pager_putpages: object mismatch %p/%p",
 		    object,
 		    m[0]->object
 		);
 	}
 
 	/*
 	 * Step 1
 	 *
 	 * Turn object into OBJT_SWAP
 	 * check for bogus sysops
 	 * force sync if not pageout process
 	 *
 	 * The object lock is dropped here and is only retaken, per cluster,
 	 * while the metadata is updated; it is reacquired for the caller at
 	 * the very end of the function.
 	 */
 	if (object->type != OBJT_SWAP)
 		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 	VM_OBJECT_UNLOCK(object);
 
 	if (curproc != pageproc)
 		sync = TRUE;
 
 	/*
 	 * Step 2
 	 *
 	 * Update nsw parameters from swap_async_max sysctl values.
 	 * Do not let the sysop crash the machine with bogus numbers.
 	 */
 	mtx_lock(&pbuf_mtx);
 	if (swap_async_max != nsw_wcount_async_max) {
 		/* This n shadows the function-level n used by Step 3. */
 		int n;
 
 		/*
 		 * limit range
 		 */
 		if ((n = swap_async_max) > nswbuf / 2)
 			n = nswbuf / 2;
 		if (n < 1)
 			n = 1;
 		swap_async_max = n;
 
 		/*
 		 * Adjust difference ( if possible ).  If the current async
 		 * count is too low, we may not be able to make the adjustment
 		 * at this time.
 		 */
 		n -= nsw_wcount_async_max;
 		if (nsw_wcount_async + n >= 0) {
 			nsw_wcount_async += n;
 			nsw_wcount_async_max += n;
 			wakeup(&nsw_wcount_async);
 		}
 	}
 	mtx_unlock(&pbuf_mtx);
 
 	/*
 	 * Step 3
 	 *
 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 	 * The page is left dirty until the pageout operation completes
 	 * successfully.
 	 *
 	 * Each iteration handles one cluster of n pages starting at m[i];
 	 * the loop increment advances by whatever n ended up being.
 	 */
 	for (i = 0; i < count; i += n) {
 		int j;
 		struct buf *bp;
 		daddr_t blk;
 
 		/*
 		 * Maximum I/O size is limited by a number of factors.
 		 */
 		n = min(BLIST_MAX_ALLOC, count - i);
 		n = min(n, nsw_cluster_max);
 
 		/*
 		 * Get biggest block of swap we can.  If we fail, fall
 		 * back and try to allocate a smaller block.  Don't go
 		 * overboard trying to allocate space if it would overly
 		 * fragment swap.
 		 */
 		while (
 		    (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
 		    n > 4
 		) {
 			n >>= 1;
 		}
 		/*
 		 * No swap available even at the reduced size: fail just these
 		 * n pages and move on to the next cluster.
 		 */
 		if (blk == SWAPBLK_NONE) {
 			for (j = 0; j < n; ++j)
 				rtvals[i+j] = VM_PAGER_FAIL;
 			continue;
 		}
 
 		/*
 		 * All I/O parameters have been satisfied, build the I/O
 		 * request and assign the swap space.
 		 *
 		 * Note the async path assigns b_flags (it does not OR into
 		 * it) before B_PAGING is added below.
 		 */
 		if (sync == TRUE) {
 			bp = getpbuf(&nsw_wcount_sync);
 		} else {
 			bp = getpbuf(&nsw_wcount_async);
 			bp->b_flags = B_ASYNC;
 		}
 		bp->b_flags |= B_PAGING;
 		bp->b_iocmd = BIO_WRITE;
 
 		pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
 
 		bp->b_rcred = crhold(thread0.td_ucred);
 		bp->b_wcred = crhold(thread0.td_ucred);
 		bp->b_bcount = PAGE_SIZE * n;
 		bp->b_bufsize = PAGE_SIZE * n;
 		bp->b_blkno = blk;
 
 		/*
 		 * Under the object lock, record the new swap block for each
 		 * page (page i+j gets block blk+j), keep the page dirty, and
 		 * mark it as having swap I/O in progress.
 		 */
 		VM_OBJECT_LOCK(object);
 		for (j = 0; j < n; ++j) {
 			vm_page_t mreq = m[i+j];
 
 			swp_pager_meta_build(
 			    mreq->object,
 			    mreq->pindex,
 			    blk + j
 			);
 			vm_page_dirty(mreq);
 			rtvals[i+j] = VM_PAGER_OK;
 
 			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
 		VM_OBJECT_UNLOCK(object);
 		bp->b_npages = n;
 		/*
 		 * Must set dirty range for NFS to work.
 		 */
 		bp->b_dirtyoff = 0;
 		bp->b_dirtyend = bp->b_bcount;
 
 		PCPU_INC(cnt.v_swapout);
 		PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
 
 		/*
 		 * asynchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 		 */
 		if (sync == FALSE) {
 			bp->b_iodone = swp_pager_async_iodone;
 			BUF_KERNPROC(bp);
 			swp_pager_strategy(bp);
 
 			for (j = 0; j < n; ++j)
 				rtvals[i+j] = VM_PAGER_PEND;
 			/* restart outer loop */
 			continue;
 		}
 
 		/*
 		 * synchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 		 */
 		bp->b_iodone = bdone;
 		swp_pager_strategy(bp);
 
 		/*
 		 * Wait for the sync I/O to complete, then update rtvals.
 		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
 		 * our async completion routine at the end, thus avoiding a
 		 * double-free.
 		 */
 		bwait(bp, PVM, "swwrt");
 		for (j = 0; j < n; ++j)
 			rtvals[i+j] = VM_PAGER_PEND;
 		/*
 		 * Now that we are through with the bp, we can call the
 		 * normal async completion, which frees everything up.
 		 */
 		swp_pager_async_iodone(bp);
 	}
 	/* Hand the object back locked, matching the state on entry. */
 	VM_OBJECT_LOCK(object);
 }
 
 /*
  *	swp_pager_async_iodone:
  *
  *	Completion routine for asynchronous reads and writes from/to swap.
  *	Also called manually by synchronous code to finish up a bp.
  *
  *	For READ operations, the pages are VPO_BUSY'd.  For WRITE operations,
  *	the pages are vm_page_t->busy'd.  For READ operations, we VPO_BUSY
  *	unbusy all pages except the 'main' request page.  For WRITE
  *	operations, we vm_page_t->busy'd unbusy all pages ( we can do this
  *	because we marked them all VM_PAGER_PEND on return from putpages ).
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_async_iodone(struct buf *bp)
 {
 	int i;
 	vm_object_t object = NULL;
 
 	/*
 	 * report error
 	 *
 	 * NOTE(review): the two adjacent format literals concatenate without
 	 * a separating space, so the message reads "blkno N,size M".
 	 */
 	if (bp->b_ioflags & BIO_ERROR) {
 		printf(
 		    "swap_pager: I/O error - %s failed; blkno %ld,"
 			"size %ld, error %d\n",
 		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 		    (long)bp->b_blkno,
 		    (long)bp->b_bcount,
 		    bp->b_error
 		);
 	}
 
 	/*
 	 * remove the mapping for kernel virtual
 	 */
 	pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 
 	/*
 	 * The object is taken from the first page only; it stays NULL (and
 	 * nothing is locked) when the buffer carries no pages.
 	 */
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
 		VM_OBJECT_LOCK(object);
 	}
 
 	/*
 	 * cleanup pages.  If an error occurs writing to swap, we are in
 	 * very serious trouble.  If it happens to be a disk error, though,
 	 * we may be able to recover by reassigning the swap later on.  So
 	 * in this case we remove the m->swapblk assignment for the page
 	 * but do not free it in the rlist.  The erroneous block(s) are thus
 	 * never reallocated as swap.  Redirty the page and continue.
 	 */
 	for (i = 0; i < bp->b_npages; ++i) {
 		vm_page_t m = bp->b_pages[i];
 
 		/* Cleared first, for every page, on every outcome. */
 		m->oflags &= ~VPO_SWAPINPROG;
 
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
 			 * away without freeing it back to swapspace, so it
 			 * can never be used again.  But I can't from an
 			 * interrupt.
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				/*
 				 * When reading, reqpage needs to stay
 				 * locked for the parent, but all other
 				 * pages can be freed.  We still want to
 				 * wakeup the parent waiting on the page,
 				 * though.  ( also: pg_reqpage can be -1 and
 				 * not match anything ).
 				 *
 				 * We have to wake specifically requested pages
 				 * up too because we cleared VPO_SWAPINPROG and
 				 * someone may be waiting for that.
 				 *
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
 				 */
 				m->valid = 0;
 				if (i != bp->b_pager.pg_reqpage)
 					swp_pager_free_nrpage(m);
 				else
 					vm_page_flash(m);
 				/*
 				 * If i == bp->b_pager.pg_reqpage, do not wake
 				 * the page up.  The caller needs to.
 				 */
 			} else {
 				/*
 				 * If a write error occurs, reactivate page
 				 * so it doesn't clog the inactive list,
 				 * then finish the I/O.
 				 */
 				vm_page_dirty(m);
 				vm_page_lock(m);
 				vm_page_activate(m);
 				vm_page_unlock(m);
 				vm_page_io_finish(m);
 			}
 		} else if (bp->b_iocmd == BIO_READ) {
 			/*
 			 * NOTE: for reads, m->dirty will probably be
 			 * overridden by the original caller of getpages so
 			 * we cannot set them in order to free the underlying
 			 * swap in a low-swap situation.  I don't think we'd
 			 * want to do that anyway, but it was an optimization
 			 * that existed in the old swapper for a time before
 			 * it got ripped out due to precisely this problem.
 			 *
 			 * If not the requested page then deactivate it.
 			 *
 			 * Note that the requested page, reqpage, is left
 			 * busied, but we still have to wake it up.  The
 			 * other pages are released (unbusied) by
 			 * vm_page_wakeup().
 			 */
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("swp_pager_async_iodone: page %p is mapped", m));
 			m->valid = VM_PAGE_BITS_ALL;
 			KASSERT(m->dirty == 0,
 			    ("swp_pager_async_iodone: page %p is dirty", m));
 
 			/*
 			 * We have to wake specifically requested pages
 			 * up too because we cleared VPO_SWAPINPROG and
 			 * could be waiting for it in getpages.  However,
 			 * be sure to not unbusy getpages specifically
 			 * requested page - getpages expects it to be
 			 * left busy.
 			 */
 			if (i != bp->b_pager.pg_reqpage) {
 				vm_page_lock(m);
 				vm_page_deactivate(m);
 				vm_page_unlock(m);
 				vm_page_wakeup(m);
 			} else
 				vm_page_flash(m);
 		} else {
 			/*
 			 * For write success, clear the dirty
 			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiter's up ).
 			 */
 			KASSERT(!pmap_page_is_write_mapped(m),
 			    ("swp_pager_async_iodone: page %p is not write"
 			    " protected", m));
 			vm_page_undirty(m);
 			vm_page_io_finish(m);
 			if (vm_page_count_severe()) {
 				vm_page_lock(m);
 				vm_page_try_to_cache(m);
 				vm_page_unlock(m);
 			}
 		}
 	}
 
 	/*
 	 * adjust pip.  NOTE: the original parent may still have its own
 	 * pip refs on the object.
 	 */
 	if (object != NULL) {
 		vm_object_pip_wakeupn(object, bp->b_npages);
 		VM_OBJECT_UNLOCK(object);
 	}
 
 	/*
 	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling
 	 * bstrategy(). Set them back to NULL now we're done with it, or we'll
 	 * trigger a KASSERT in relpbuf().
 	 */
 	if (bp->b_vp) {
 		    bp->b_vp = NULL;
 		    bp->b_bufobj = NULL;
 	}
 	/*
 	 * release the physical I/O buffer
 	 *
 	 * The counter handed to relpbuf() mirrors the one used at getpbuf()
 	 * time: reads use nsw_rcount, writes pick the async or sync counter
 	 * depending on B_ASYNC.
 	 */
 	relpbuf(
 	    bp,
 	    ((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
 		((bp->b_flags & B_ASYNC) ?
 		    &nsw_wcount_async :
 		    &nsw_wcount_sync
 		)
 	    )
 	);
 }
 
 /*
  *	swap_pager_isswapped:
  *
  *	Return 1 if at least one page in the given object is paged
  *	out to the given swap device.
  *
  *	This routine may not sleep.
  */
 int
 swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
 {
 	struct swblock *sb;
 	daddr_t pidx;
 	int found, nblk, slot;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type != OBJT_SWAP)
 		return (0);
 
 	found = 0;
 	pidx = 0;
 	mtx_lock(&swhash_mtx);
 	/*
 	 * Probe one hash slot per swblock the object accounts for, stepping
 	 * the page index a full swblock at a time, and stop at the first
 	 * swap block that lives on the device.
 	 */
 	for (nblk = 0; nblk < object->un_pager.swp.swp_bcount; nblk++) {
 		sb = *swp_pager_hash(object, pidx);
 		pidx += SWAP_META_PAGES;
 		if (sb == NULL)
 			continue;
 		for (slot = 0; slot < SWAP_META_PAGES; slot++) {
 			if (swp_pager_isondev(sb->swb_pages[slot], sp)) {
 				found = 1;
 				goto out;
 			}
 		}
 	}
 out:
 	mtx_unlock(&swhash_mtx);
 	return (found);
 }
 
 /*
  * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
  *
  *	This routine dissociates the page at the given index within a
  *	swap block from its backing store, paging it in if necessary.
  *	If the page is paged in, it is placed in the inactive queue,
  *	since it had its backing store ripped out from under it.
  *	We also attempt to swap in all other pages in the swap block,
  *	we only guarantee that the one at the specified index is
  *	paged in.
  *
  *	XXX - The code to page the whole block in doesn't work, so we
  *	      revert to the one-by-one behavior for now.  Sigh.
  */
 static inline void
 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
 	int resident;
 
 	vm_object_pip_add(object, 1);
 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
 
 	/*
 	 * A fully valid page only needs its swap released; anything else
 	 * has to be read back from swap first, and a failed read is fatal.
 	 */
 	resident = (m->valid == VM_PAGE_BITS_ALL);
 	if (!resident && swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
 		panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
 
 	/*
 	 * Common tail: dirty the page, queue it (active if it was already
 	 * resident, inactive if it was just read in), unbusy it and drop
 	 * its swap backing.
 	 */
 	vm_object_pip_subtract(object, 1);
 	vm_page_dirty(m);
 	vm_page_lock(m);
 	if (resident)
 		vm_page_activate(m);
 	else
 		vm_page_deactivate(m);
 	vm_page_unlock(m);
 	vm_page_wakeup(m);
 	vm_pager_page_unswapped(m);
 }
 
 /*
  *	swap_pager_swapoff:
  *
  *	Page in all of the pages that have been paged out to the
  *	given device.  The corresponding blocks in the bitmap must be
  *	marked as allocated and the device must be flagged SW_CLOSING.
  *	There may be no processes swapped out to the device.
  *
  *	This routine may block.
  */
 static void
 swap_pager_swapoff(struct swdevt *sp)
 {
 	struct swblock *swap;
 	int i, j, retries;
 
 	GIANT_REQUIRED;
 
 	retries = 0;
 full_rescan:
 	/* One pass over every bucket of the global swap hash. */
 	mtx_lock(&swhash_mtx);
 	for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
 restart:
 		for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
 			vm_object_t object = swap->swb_object;
 			vm_pindex_t pindex = swap->swb_index;
 			for (j = 0; j < SWAP_META_PAGES; ++j) {
 				if (swp_pager_isondev(swap->swb_pages[j], sp)) {
 					/* avoid deadlock */
 					/*
 					 * The object lock is only tried while
 					 * swhash_mtx is held.  On failure the
 					 * rest of this swblock is skipped and
 					 * left for a later full rescan.
 					 */
 					if (!VM_OBJECT_TRYLOCK(object)) {
 						break;
 					} else {
 						/*
 						 * swhash_mtx is dropped for
 						 * the pagein, so the current
 						 * bucket is walked again from
 						 * its head afterwards.
 						 */
 						mtx_unlock(&swhash_mtx);
 						swp_pager_force_pagein(object,
 						    pindex + j);
 						VM_OBJECT_UNLOCK(object);
 						mtx_lock(&swhash_mtx);
 						goto restart;
 					}
 				}
 			}
 		}
 	}
 	mtx_unlock(&swhash_mtx);
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
 		 * removed, so we will miss their pages and need to
 		 * make another pass.  We have marked this device as
 		 * SW_CLOSING, so the activity should finish soon.
 		 *
 		 * Up to 100 further passes are made, pausing hz / 20 ticks
 		 * between them, before giving up with a panic.
 		 */
 		retries++;
 		if (retries > 100) {
 			panic("swapoff: failed to locate %d swap blocks",
 			    sp->sw_used);
 		}
 		pause("swpoff", hz / 20);
 		goto full_rescan;
 	}
 }
 
 /************************************************************************
  *				SWAP META DATA 				*
  ************************************************************************
  *
  *	These routines manipulate the swap metadata stored in the
  *	OBJT_SWAP object.
  *
  *	Swap metadata is implemented with a global hash and not directly
  *	linked into the object.  Instead the object simply contains
  *	appropriate tracking counters.
  */
 
 /*
  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
  *
  *	We first convert the object to a swap object if it is a default
  *	object.
  *
  *	The specified swapblk is added to the object's swap metadata.  If
  *	the swapblk is not valid, it is freed instead.  Any previously
  *	assigned swapblk is freed.
  */
 static void
 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 {
 	/* Set while the "zone exhausted" warning is outstanding. */
 	static volatile int exhausted;
 	struct swblock *swap;
 	struct swblock **pswap;
 	int idx;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/*
 	 * Convert default object to swap object if necessary
 	 *
 	 * Objects with a non-NULL handle are additionally linked onto the
 	 * NOBJLIST() list for that handle under sw_alloc_mtx.
 	 */
 	if (object->type != OBJT_SWAP) {
 		object->type = OBJT_SWAP;
 		object->un_pager.swp.swp_bcount = 0;
 
 		if (object->handle != NULL) {
 			mtx_lock(&sw_alloc_mtx);
 			TAILQ_INSERT_TAIL(
 			    NOBJLIST(object->handle),
 			    object,
 			    pager_object_list
 			);
 			mtx_unlock(&sw_alloc_mtx);
 		}
 	}
 
 	/*
 	 * Locate hash entry.  If not found create, but if we aren't adding
 	 * anything just return.  If we run out of space in the map we wait
 	 * and, since the hash table may have changed, retry.
 	 */
 retry:
 	mtx_lock(&swhash_mtx);
 	pswap = swp_pager_hash(object, pindex);
 
 	if ((swap = *pswap) == NULL) {
 		int i;
 
 		if (swapblk == SWAPBLK_NONE)
 			goto done;
 
 		/*
 		 * Non-sleeping allocation.  On failure both swhash_mtx and
 		 * the object lock are dropped while waiting, which is why
 		 * the lookup is redone from the retry label.
 		 */
 		swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
 		if (swap == NULL) {
 			mtx_unlock(&swhash_mtx);
 			VM_OBJECT_UNLOCK(object);
 			if (uma_zone_exhausted(swap_zone)) {
 				/* Warn only on the 0 -> 1 transition. */
 				if (atomic_cmpset_int(&exhausted, 0, 1))
 					printf("swap zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
 				pause("swzonex", 10);
 			} else
 				VM_WAIT;
 			VM_OBJECT_LOCK(object);
 			goto retry;
 		}
 
 		/* Allocation worked again: report recovery once (1 -> 0). */
 		if (atomic_cmpset_int(&exhausted, 1, 0))
 			printf("swap zone ok\n");
 
 		/*
 		 * Initialise the fresh swblock: it covers the aligned run of
 		 * SWAP_META_PAGES page indices containing pindex, and every
 		 * slot starts out empty.
 		 */
 		swap->swb_hnext = NULL;
 		swap->swb_object = object;
 		swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
 		swap->swb_count = 0;
 
 		++object->un_pager.swp.swp_bcount;
 
 		for (i = 0; i < SWAP_META_PAGES; ++i)
 			swap->swb_pages[i] = SWAPBLK_NONE;
 	}
 
 	/*
 	 * Delete prior contents of metadata
 	 */
 	idx = pindex & SWAP_META_MASK;
 
 	if (swap->swb_pages[idx] != SWAPBLK_NONE) {
 		swp_pager_freeswapspace(swap->swb_pages[idx], 1);
 		--swap->swb_count;
 	}
 
 	/*
 	 * Enter block into metadata
 	 *
 	 * swb_count only counts slots holding a real block, so storing
 	 * SWAPBLK_NONE does not bump it.
 	 */
 	swap->swb_pages[idx] = swapblk;
 	if (swapblk != SWAPBLK_NONE)
 		++swap->swb_count;
 done:
 	mtx_unlock(&swhash_mtx);
 }
 
 /*
  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
  *
  *	The requested range of blocks is freed, with any associated swap
  *	returned to the swap bitmap.
  *
  *	This routine will free swap metadata structures as they are cleaned
  *	out.  This routine does *NOT* operate on swap metadata associated
  *	with resident pages.
  */
 static void
 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
 {
 	struct swblock **slotp;
 	struct swblock *sb;
 	daddr_t blk;
 	int off, skip;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type != OBJT_SWAP)
 		return;
 
 	while (count > 0) {
 		mtx_lock(&swhash_mtx);
 		slotp = swp_pager_hash(object, index);
 		sb = *slotp;
 
 		/*
 		 * No swblock covers this index: jump straight to the start
 		 * of the next swblock-sized group of pages.
 		 */
 		if (sb == NULL) {
 			skip = SWAP_META_PAGES - (index & SWAP_META_MASK);
 			count -= skip;
 			index += skip;
 			mtx_unlock(&swhash_mtx);
 			continue;
 		}
 
 		/*
 		 * Release the block stored for this page, if any, and tear
 		 * the swblock down once its last slot has been emptied.
 		 */
 		off = index & SWAP_META_MASK;
 		blk = sb->swb_pages[off];
 		if (blk != SWAPBLK_NONE) {
 			swp_pager_freeswapspace(blk, 1);
 			sb->swb_pages[off] = SWAPBLK_NONE;
 			if (--sb->swb_count == 0) {
 				*slotp = sb->swb_hnext;
 				uma_zfree(swap_zone, sb);
 				--object->un_pager.swp.swp_bcount;
 			}
 		}
 		--count;
 		++index;
 		mtx_unlock(&swhash_mtx);
 	}
 }
 
 /*
  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
  *
  *	This routine locates and destroys all swap metadata associated with
  *	an object.
  */
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
 	struct swblock **slotp;
 	struct swblock *sb;
 	daddr_t pidx;
 	int slot;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type != OBJT_SWAP)
 		return;
 
 	/*
 	 * Step through the page index space one swblock at a time until the
 	 * object's swblock count has dropped to zero.
 	 */
 	for (pidx = 0; object->un_pager.swp.swp_bcount != 0;
 	    pidx += SWAP_META_PAGES) {
 		mtx_lock(&swhash_mtx);
 		slotp = swp_pager_hash(object, pidx);
 		sb = *slotp;
 		if (sb == NULL) {
 			mtx_unlock(&swhash_mtx);
 			continue;
 		}
 
 		/* Give back every swap block still recorded in this swblock. */
 		for (slot = 0; slot < SWAP_META_PAGES; slot++) {
 			if (sb->swb_pages[slot] == SWAPBLK_NONE)
 				continue;
 			--sb->swb_count;
 			swp_pager_freeswapspace(sb->swb_pages[slot], 1);
 		}
 		if (sb->swb_count != 0)
 			panic("swap_pager_meta_free_all: swb_count != 0");
 
 		/* Unlink the emptied swblock and return it to its zone. */
 		*slotp = sb->swb_hnext;
 		uma_zfree(swap_zone, sb);
 		--object->un_pager.swp.swp_bcount;
 		mtx_unlock(&swhash_mtx);
 	}
 }
 
 /*
  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
  *
  *	This routine is capable of looking up, popping, or freeing
  *	swapblk assignments in the swap meta data or in the vm_page_t.
  *	The routine typically returns the swapblk being looked-up, or popped,
  *	or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
  *	was invalid.  This routine will automatically free any invalid
  *	meta-data swapblks.
  *
  *	It is not possible to store invalid swapblks in the swap meta data
  *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
  *
  *	When acting on a busy resident page and paging is in progress, we
  *	have to wait until paging is complete but otherwise can act on the
  *	busy page.
  *
  *	SWM_FREE	remove and free swap block from metadata
  *	SWM_POP		remove from meta data but do not free.. pop it out
  */
 static daddr_t
 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 {
 	struct swblock **slotp;
 	struct swblock *sb;
 	daddr_t blk;
 	int off;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/*
 	 * Only OBJT_SWAP objects carry swap metadata, and even then the
 	 * swblock for this index may not have been allocated yet.
 	 */
 	if (object->type != OBJT_SWAP)
 		return (SWAPBLK_NONE);
 
 	blk = SWAPBLK_NONE;
 	mtx_lock(&swhash_mtx);
 	slotp = swp_pager_hash(object, pindex);
 	sb = *slotp;
 	if (sb == NULL)
 		goto out;
 
 	off = pindex & SWAP_META_MASK;
 	blk = sb->swb_pages[off];
 
 	/* A plain lookup, or an empty slot, leaves the metadata untouched. */
 	if (blk == SWAPBLK_NONE || (flags & (SWM_FREE|SWM_POP)) == 0)
 		goto out;
 
 	/*
 	 * SWM_FREE hands the block back to the allocator and reports
 	 * SWAPBLK_NONE; SWM_POP alone returns the block to the caller.
 	 * Either way the slot is emptied, and the swblock is destroyed
 	 * when that was its last occupied slot.
 	 */
 	if (flags & SWM_FREE) {
 		swp_pager_freeswapspace(blk, 1);
 		blk = SWAPBLK_NONE;
 	}
 	sb->swb_pages[off] = SWAPBLK_NONE;
 	if (--sb->swb_count == 0) {
 		*slotp = sb->swb_hnext;
 		uma_zfree(swap_zone, sb);
 		--object->un_pager.swp.swp_bcount;
 	}
 out:
 	mtx_unlock(&swhash_mtx);
 	return (blk);
 }
 
 /*
  * System call swapon(name) enables swapping on device name,
  * which must be in the swdevsw.  Return EBUSY
  * if already swapping on this device.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct swapon_args {
 	char *name;	/* user-space path, looked up with namei() */
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 sys_swapon(struct thread *td, struct swapon_args *uap)
 {
 	struct vattr attr;
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	error = priv_check(td, PRIV_SWAPON);
 	if (error)
 		return (error);
 
 	/*
 	 * swdev_syscall_active serialises this call against other users of
 	 * the same flag; it is tested and set under Giant, and waiters
 	 * sleep until the current holder clears it.
 	 */
 	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 	    tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
 	swdev_syscall_active = 1;
 
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
 	if (swap_zone == NULL) {
 		error = ENOMEM;
 		goto done;
 	}
 
 	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * Disks go through swapongeom(); regular files are only accepted on
 	 * filesystems flagged VFCF_NETWORK, sized from their attributes.
 	 * For any other vnode, error keeps whatever vn_isdisk() or
 	 * VOP_GETATTR() stored in it.
 	 */
 	if (vn_isdisk(vp, &error)) {
 		error = swapongeom(td, vp);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
 		/*
 		 * Allow direct swapping to NFS regular files in the same
 		 * way that nfs_mountroot() sets up diskless swapping.
 		 */
 		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 	}
 
 	/* The vnode is released here only on failure. */
 	if (error)
 		vrele(vp);
 done:
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Check that the total amount of swap currently configured does not
  * exceed half the theoretical maximum.  If it does, print a warning
  * message and return -1; otherwise, return 0.
  */
 static int
 swapon_check_swzone(unsigned long npages)
 {
 	unsigned long ceiling, advised;
 
 	/*
 	 * The ceiling is what the swap zone could describe if every swblock
 	 * were completely full; the advisory limit is half of that.
 	 */
 	ceiling = uma_zone_get_max(swap_zone) * SWAP_META_PAGES;
 	advised = ceiling / 2;
 	if (npages <= advised)
 		return (0);
 
 	printf("warning: total configured swap (%lu pages) "
 	    "exceeds maximum recommended amount (%lu pages).\n",
 	    npages, advised);
 	printf("warning: increase kern.maxswzone "
 	    "or reduce amount of swap.\n");
 	return (-1);
 }
 
 static void
 swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
 {
 	struct swdevt *sp, *tsp;
 	swblk_t dvbase;
 	u_long mblocks;
 
 	/*
 	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 	 * First chop nblks off to page-align it, then convert.
 	 *
 	 * sw->sw_nblks is in page-sized chunks now too.
 	 */
 	nblks &= ~(ctodb(1) - 1);
 	nblks = dbtoc(nblks);
 
 	/*
 	 * If we go beyond this, we get overflows in the radix
 	 * tree bitmap code.
 	 */
 	mblocks = 0x40000000 / BLIST_META_RADIX;
 	if (nblks > mblocks) {
 		printf(
     "WARNING: reducing swap size to maximum of %luMB per unit\n",
 		    mblocks / 1024 / 1024 * PAGE_SIZE);
 		nblks = mblocks;
 	}
 
 	/* Build the device descriptor from the caller-supplied pieces. */
 	sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 	sp->sw_vp = vp;
 	sp->sw_id = id;
 	sp->sw_dev = dev;
 	sp->sw_flags = 0;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
 
 	sp->sw_blist = blist_create(nblks, M_WAITOK);
 	/*
 	 * Do not free the first two block in order to avoid overwriting
 	 * any bsd label at the front of the partition
 	 *
 	 * NOTE(review): nblks is unsigned, so nblks - 2 wraps if fewer than
 	 * two pages remain after rounding; no guard is visible here --
 	 * confirm callers cannot pass such a small device.
 	 */
 	blist_free(sp->sw_blist, 2, nblks - 2);
 
 	/*
 	 * Place the new device after the highest sw_end currently on the
 	 * list, so each device owns a distinct range of swap block numbers.
 	 */
 	dvbase = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 		if (tsp->sw_end >= dvbase) {
 			/*
 			 * We put one uncovered page between the devices
 			 * in order to definitively prevent any cross-device
 			 * I/O requests
 			 */
 			dvbase = tsp->sw_end + 1;
 		}
 	}
 	sp->sw_first = dvbase;
 	sp->sw_end = dvbase + nblks;
 	TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 	nswapdev++;
 	swap_pager_avail += nblks;
 	swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
 	/* Advisory only: the result of the zone check is not acted upon. */
 	swapon_check_swzone(swap_total / PAGE_SIZE);
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * SYSCALL: swapoff(devname)
  *
  * Disable swapping on the given device.
  *
  * XXX: Badly designed system call: it should use a device index
  * rather than filename as specification.  We keep sw_vp around
  * only to make this work.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct swapoff_args {
 	char *name;	/* user-space path, looked up with namei() */
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 sys_swapoff(struct thread *td, struct swapoff_args *uap)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	struct swdevt *sp;
 	int error;
 
 	error = priv_check(td, PRIV_SWAPOFF);
 	if (error)
 		return (error);
 
 	/*
 	 * Serialise against other holders of swdev_syscall_active; the flag
 	 * is tested and set under Giant.
 	 */
 	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 	    tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 	swdev_syscall_active = 1;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
 	    td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * The device is identified by comparing the looked-up vnode with the
 	 * vnode stored in each configured swap device; no match is EINVAL.
 	 *
 	 * NOTE(review): no vrele(vp) is visible on any path below -- confirm
 	 * whether the reference from the lookup is dropped elsewhere.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_vp == vp)
 			break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	if (sp == NULL) {
 		error = EINVAL;
 		goto done;
 	}
 	error = swapoff_one(sp, td->td_ucred);
 done:
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 static int
 swapoff_one(struct swdevt *sp, struct ucred *cred)
 {
 	u_long nblks, dvbase;
 #ifdef MAC
 	int error;
 #endif
 
 	mtx_assert(&Giant, MA_OWNED);
 #ifdef MAC
 	/* The MAC check is made with the swap vnode exclusively locked. */
 	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_system_check_swapoff(cred, sp->sw_vp);
 	(void) VOP_UNLOCK(sp->sw_vp, 0);
 	if (error != 0)
 		return (error);
 #endif
 	nblks = sp->sw_nblks;
 
 	/*
 	 * We can turn off this swap device safely only if the
 	 * available virtual memory in the system will fit the amount
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 */
 	if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
 	    nblks + nswap_lowat) {
 		return (ENOMEM);
 	}
 
 	/*
 	 * Prevent further allocations on this device.
 	 *
 	 * Whatever blist_fill() reports as newly filled is subtracted from
 	 * the global count of available swap.
 	 *
 	 * NOTE(review): the loop is bounded by sp->sw_end (sw_first + nblks)
 	 * rather than nblks, while the blist was created with nblks entries
 	 * -- confirm the overshoot is harmless for devices with sw_first > 0.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_flags |= SW_CLOSING;
 	for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
 		swap_pager_avail -= blist_fill(sp->sw_blist,
 		     dvbase, dmmax);
 	}
 	swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * Page in the contents of the device and close it.
 	 */
 	swap_pager_swapoff(sp);
 
 	sp->sw_close(curthread, sp);
 	sp->sw_id = NULL;
 	/*
 	 * Unlink the device.  When it was the last one, the swap_pager_full
 	 * and swap_pager_almost_full indicators are raised, and swdevhd is
 	 * reset if it still pointed at this device.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_REMOVE(&swtailq, sp, sw_list);
 	nswapdev--;
 	if (nswapdev == 0) {
 		swap_pager_full = 2;
 		swap_pager_almost_full = 1;
 	}
 	if (swdevhd == sp)
 		swdevhd = NULL;
 	mtx_unlock(&sw_dev_mtx);
 	/* sp is freed here; callers must not touch it after success. */
 	blist_destroy(sp->sw_blist);
 	free(sp, M_VMPGDATA);
 	return (0);
 }
 
 /*
  * swapoff_all() - try to remove every configured swap device.
  *
  * Takes Giant and the swdev_syscall_active gate, then calls
  * swapoff_one() on each entry of swtailq using thread0's credentials.
  * A failure on one device is reported with printf() and the walk
  * continues with the next device.
  */
 void
 swapoff_all(void)
 {
 	struct swdevt *sp, *spt;
 	const char *devname;
 	int error;
 
 	mtx_lock(&Giant);
 	/* Wait until no other swap device operation holds the gate. */
 	while (swdev_syscall_active)
 		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 	swdev_syscall_active = 1;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
 		/* sw_dev_mtx is dropped around the swapoff_one() call. */
 		mtx_unlock(&sw_dev_mtx);
 		/*
 		 * Pick the name for the messages below before calling
 		 * swapoff_one(), which frees sp when it succeeds.
 		 */
 		if (vn_isdisk(sp->sw_vp, NULL))
 			devname = devtoname(sp->sw_vp->v_rdev);
 		else
 			devname = "[file]";
 		error = swapoff_one(sp, thread0.td_ucred);
 		if (error != 0) {
 			printf("Cannot remove swap device %s (error=%d), "
 			    "skipping.\n", devname, error);
 		} else if (bootverbose) {
 			printf("Swap device %s removed.\n", devname);
 		}
 		mtx_lock(&sw_dev_mtx);
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	/* Release the gate and wake one waiter. */
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
 	mtx_unlock(&Giant);
 }
 
 void
 swap_pager_status(int *total, int *used)
 {
 	struct swdevt *sp;
 
 	*total = 0;
 	*used = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		*total += sp->sw_nblks;
 		*used += sp->sw_used;
 	}
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * swap_dev_info() - describe the swap device at a given list position.
  *
  * Arguments:
  *	name     Zero-based position of the device on swtailq.
  *	xs       Filled in with the version, device, flags, block count
  *	         and usage of the selected device.
  *	devname  Optional buffer that receives the device name, or
  *	         "[file]" when the backing vnode is not a disk.  May be
  *	         NULL.
  *	len      Size of the devname buffer in bytes.
  *
  * Returns:
  *	0       on success
  *	ENOENT  if there is no device at position 'name'
  *
  * Discussion:
  *	The name is always NUL-terminated when len is non-zero; a name
  *	that does not fit is truncated.
  */
 int
 swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
 {
 	struct swdevt *sp;
 	const char *tmp_devname;
 	int error, n;
 
 	n = 0;
 	error = ENOENT;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (n != name) {
 			n++;
 			continue;
 		}
 		xs->xsw_version = XSWDEV_VERSION;
 		xs->xsw_dev = sp->sw_dev;
 		xs->xsw_flags = sp->sw_flags;
 		xs->xsw_nblks = sp->sw_nblks;
 		xs->xsw_used = sp->sw_used;
 		if (devname != NULL) {
 			if (vn_isdisk(sp->sw_vp, NULL))
 				tmp_devname = devtoname(sp->sw_vp->v_rdev);
 			else
 				tmp_devname = "[file]";
 			/*
 			 * strncpy() zero-pads short names but does not
 			 * terminate the buffer when the source is len
 			 * bytes or longer, so terminate it explicitly.
 			 */
 			strncpy(devname, tmp_devname, len);
 			if (len > 0)
 				devname[len - 1] = '\0';
 		}
 		error = 0;
 		break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (error);
 }
 
 static int
 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 {
 	struct xswdev xs;
 	int error;
 
 	if (arg2 != 1)			/* name length */
 		return (EINVAL);
 	error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
 	if (error != 0)
 		return (error);
 	error = SYSCTL_OUT(req, &xs, sizeof(xs));
 	return (error);
 }
 
 /*
  * Read-only (CTLFLAG_RD) entries under _vm: the nswapdev counter and
  * the swap_info node, which is served by sysctl_vm_swap_info().
  */
 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
     "Number of swap devices");
 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
     "Swap statistics by device");
 
 /*
  * vmspace_swap_count() - count the approximate swap usage in pages for a
  *			  vmspace.
  *
  *	The map must be locked.
  *
  *	Swap usage is determined by taking the proportional swap used by
  *	VM objects backing the VM map.  To make up for fractional losses,
  *	if the VM object has any swap use at all the associated map entries
  *	count for at least 1 swap page.
  */
 long
 vmspace_swap_count(struct vmspace *vmspace)
 {
 	vm_map_entry_t entry;
 	vm_object_t obj;
 	vm_map_t map;
 	long npages, total;
 
 	total = 0;
 	map = &vmspace->vm_map;
 
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		/* Only entries that are not submaps and have an object. */
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			continue;
 		obj = entry->object.vm_object;
 		if (obj == NULL)
 			continue;
 
 		VM_OBJECT_LOCK(obj);
 		if (obj->type == OBJT_SWAP &&
 		    obj->un_pager.swp.swp_bcount != 0) {
 			/*
 			 * Scale the object's swap usage by the share of
 			 * the object this entry maps; the "+ 1" makes any
 			 * swap use count for at least one page.
 			 */
 			npages = (entry->end - entry->start) / PAGE_SIZE;
 			total += obj->un_pager.swp.swp_bcount *
 			    SWAP_META_PAGES * npages / obj->size + 1;
 		}
 		VM_OBJECT_UNLOCK(obj);
 	}
 	return (total);
 }
 
 /*
  * GEOM backend
  *
  * Swapping onto disk devices.
  *
  */
 
 static g_orphan_t swapgeom_orphan;
 
 /*
  * GEOM class named "SWAP".  The only method it supplies is the orphan
  * callback, swapgeom_orphan().
  */
 static struct g_class g_swap_class = {
 	.name = "SWAP",
 	.version = G_VERSION,
 	.orphan = swapgeom_orphan,
 };
 
 DECLARE_GEOM_CLASS(g_swap_class, g_class);
 
 
 /*
  * Completion handler for bios issued by the GEOM swap backend: copy the
  * bio's flags, error and residual back into the buf stored in
  * bio_caller2, complete the buf, then destroy the bio.
  */
 static void
 swapgeom_done(struct bio *bp2)
 {
 	struct buf *buf = bp2->bio_caller2;
 
 	buf->b_error = bp2->bio_error;
 	buf->b_ioflags = bp2->bio_flags;
 	if (bp2->bio_error != 0)
 		buf->b_ioflags |= BIO_ERROR;
 	buf->b_resid = buf->b_bcount - bp2->bio_completed;
 	bufdone(buf);
 	g_destroy_bio(bp2);
 }
 
 static void
 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct bio *bio;
 	struct g_consumer *cp;
 
 	cp = sp->sw_id;
 	if (cp == NULL) {
 		bp->b_error = ENXIO;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 	if (bp->b_iocmd == BIO_WRITE)
 		bio = g_new_bio();
 	else
 		bio = g_alloc_bio();
 	if (bio == NULL) {
 		bp->b_error = ENOMEM;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 
 	bio->bio_caller2 = bp;
 	bio->bio_cmd = bp->b_iocmd;
 	bio->bio_data = bp->b_data;
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
 	g_io_request(bio, cp);
 	return;
 }
 
 static void
 swapgeom_orphan(struct g_consumer *cp)
 {
 	struct swdevt *sp;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list)
 		if (sp->sw_id == cp)
 			sp->sw_flags |= SW_CLOSING;
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * Event handler that tears down a swap consumer.
  *
  * Arguments:
  *	arg    The struct g_consumer to release.
  *	flags  Not used by this handler.
  */
 static void
 swapgeom_close_ev(void *arg, int flags)
 {
 	struct g_consumer *cp;
 
 	cp = arg;
 	/*
 	 * Presumably undoes the g_access(cp, 1, 1, 0) made when the
 	 * device was added in swapongeom_ev() -- confirm against g_access.
 	 */
 	g_access(cp, -1, -1, 0);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 /*
  * Close routine of the GEOM swap backend: run swapgeom_close_ev() on the
  * device's consumer (sw_id) through g_waitfor_event().  'td' is unused.
  */
 static void
 swapgeom_close(struct thread *td, struct swdevt *sw)
 {
 
 	/* XXX: direct call when Giant untangled */
 	g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
 }
 
 
 /*
  * Argument block passed from swapongeom() to swapongeom_ev().
  */
 struct swh0h0 {
 	struct cdev *dev;	/* In: device to swap on (the vnode's v_rdev). */
 	struct vnode *vp;	/* In: vnode of that device. */
 	int	error;		/* Out: 0 or errno set by swapongeom_ev(). */
 };
 
 /*
  * Event handler that attaches a GEOM provider as a swap device.
  *
  * Arguments:
  *	arg    A struct swh0h0 naming the device and vnode; the result is
  *	       returned in its 'error' member.
  *	flags  Not used by this handler.
  *
  * Results stored in swh->error:
  *	0       the device was handed to swaponsomething()
  *	ENODEV  the cdev has no GEOM provider
  *	EBUSY   an existing swap device already uses this provider
  *	errno   from g_access() if access could not be obtained
  */
 static void
 swapongeom_ev(void *arg, int flags)
 {
 	struct swh0h0 *swh;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	static struct g_geom *gp;	/* Created on first use, then reused. */
 	struct swdevt *sp;
 	u_long nblks;
 	int error;
 
 	swh = arg;
 	swh->error = 0;
 	pp = g_dev_getprovider(swh->dev);
 	if (pp == NULL) {
 		swh->error = ENODEV;
 		return;
 	}
 	/* Refuse a provider that is already in use by a swap device. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		cp = sp->sw_id;
 		if (cp != NULL && cp->provider == pp) {
 			mtx_unlock(&sw_dev_mtx);
 			swh->error = EBUSY;
 			return;
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap");
 	cp = g_new_consumer(gp);
 	/* NOTE(review): the return value of g_attach() is not checked. */
 	g_attach(cp, pp);
 	/*
 	 * XXX: Every time you think you can improve the margin for
 	 * footshooting, somebody depends on the ability to do so:
 	 * savecore(8) wants to write to our swapdev so we cannot
 	 * set an exclusive count :-(
 	 */
 	error = g_access(cp, 1, 1, 0);
 	if (error) {
 		/* Undo the attach and drop the consumer on failure. */
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		swh->error = error;
 		return;
 	}
 	/* Device size expressed in DEV_BSIZE units. */
 	nblks = pp->mediasize / DEV_BSIZE;
 	swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
 	    swapgeom_close, dev2udev(swh->dev));
 	swh->error = 0;
 	return;
 }
 
 static int
 swapongeom(struct thread *td, struct vnode *vp)
 {
 	int error;
 	struct swh0h0 swh;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	swh.dev = vp->v_rdev;
 	swh.vp = vp;
 	swh.error = 0;
 	/* XXX: direct call when Giant untangled */
 	error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
 	if (!error)
 		error = swh.error;
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * VNODE backend
  *
  * This is used mainly for network filesystem (read: probably only tested
  * with NFS) swapfiles.
  *
  */
 
 /*
  * Strategy routine of the vnode swap backend: retarget the buf at the
  * swap vnode stored in sw_id and issue it with bstrategy().
  */
 static void
 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct vnode *vp2;
 
 	/* Make the block number relative to this device, then apply ctodb(). */
 	bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 
 	vp2 = sp->sw_id;
 	/* NOTE(review): the release matching this vhold() is not in this function. */
 	vhold(vp2);
 	if (bp->b_iocmd == BIO_WRITE) {
 		/*
 		 * For writes, move the write reference from the buf's
 		 * current bufobj (if any) to the swap vnode's bufobj.
 		 */
 		if (bp->b_bufobj)
 			bufobj_wdrop(bp->b_bufobj);
 		bufobj_wref(&vp2->v_bufobj);
 	}
 	if (bp->b_bufobj != &vp2->v_bufobj)
 		bp->b_bufobj = &vp2->v_bufobj;
 	bp->b_vp = vp2;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 	return;
 }
 
 /*
  * Close routine of the vnode swap backend: close the swap vnode, which
  * swaponvp() opened with FREAD | FWRITE, using td's credentials, then
  * release it with vrele().
  */
 static void
 swapdev_close(struct thread *td, struct swdevt *sp)
 {
 
 	VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
 	vrele(sp->sw_vp);
 }
 
 
 /*
  * swaponvp() - start swapping on a vnode (vnode backend).
  *
  * Arguments:
  *	td     Calling thread; its credentials are used for the MAC check
  *	       and the open.
  *	vp     The vnode to swap on.
  *	nblks  Size handed on to swaponsomething().
  *
  * Returns:
  *	0      on success
  *	ENXIO  if nblks is 0
  *	EBUSY  if the vnode is already registered as a swap device
  *	errno  from mac_system_check_swapon() (MAC kernels) or VOP_OPEN()
  */
 static int
 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 {
 	struct swdevt *sp;
 	int error;
 
 	if (nblks == 0)
 		return (ENXIO);
 	/* Reject a vnode that already backs a swap device. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_id == vp) {
 			mtx_unlock(&sw_dev_mtx);
 			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * With MAC compiled in, the "if" below guards the VOP_OPEN() that
 	 * follows the #endif, so the open only happens when the MAC check
 	 * passes.  Without MAC the open is unconditional.
 	 */
 #ifdef MAC
 	error = mac_system_check_swapon(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
 	(void) VOP_UNLOCK(vp, 0);
 	if (error)
 		return (error);
 
 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 	    NODEV);
 	return (0);
 }
Index: user/attilio/vmc-playground/sys/vm/uma.h
===================================================================
--- user/attilio/vmc-playground/sys/vm/uma.h	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/uma.h	(revision 247224)
@@ -1,654 +1,651 @@
 /*-
  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 /*
  * uma.h - External definitions for the Universal Memory Allocator
  *
 */
 
 #ifndef VM_UMA_H
 #define VM_UMA_H
 
 #include <sys/param.h>		/* For NULL */
 #include <sys/malloc.h>		/* For M_* */
 
 /* User visible parameters */
 #define UMA_SMALLEST_UNIT       (PAGE_SIZE / 256) /* Smallest item allocated */
 
 /* Types and type defs */
 
 struct uma_zone;
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
 void zone_drain(uma_zone_t);
 
 /*
  * Item constructor
  *
  * Arguments (in the order they are passed):
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the allocated item
  *	arg   The arg field passed to uma_zalloc_arg
  *	flags See zalloc flags
  *
  * Returns:
  *	0      on success
  *      errno  on failure
  *
  * Discussion:
  *	The constructor is called just before the memory is returned
  *	to the user. It may block if necessary.
  */
 typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags);
 
 /*
  * Item destructor
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being destructed.
  *	arg   Argument passed through uma_zfree_arg
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	The destructor may perform operations that differ from those performed
  *	by the initializer, but it must leave the object in the same state.
  *	This IS type stable storage.  This is called after EVERY zfree call.
  */
 typedef void (*uma_dtor)(void *mem, int size, void *arg);
 
 /*
  * Item initializer
  *
  * Arguments:
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being initialized.
  *	flags See zalloc flags
  *
  * Returns:
  *	0      on success
  *      errno  on failure
  *
  * Discussion:
  *	The initializer is called when the memory is cached in the uma zone.
  *	The initializer and the destructor should leave the object in the same
  *	state.
  */
 typedef int (*uma_init)(void *mem, int size, int flags);
 
 /*
  * Item discard function
  *
  * Arguments:
  *	item  A pointer to memory which has been 'freed' but has not left the
  *	      zone's cache.
  *	size  The size of the item being discarded.
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	This routine is called when memory leaves a zone and is returned to the
  *	system for other uses.  It is the counter-part to the init function.
  */
 typedef void (*uma_fini)(void *mem, int size);
 
 /*
  * What's the difference between initializing and constructing?
  *
  * The item is initialized when it is cached, and this is the state that the
  * object should be in when returned to the allocator. The purpose of this is
  * to remove some code which would otherwise be called on each allocation by
  * utilizing a known, stable state.  This differs from the constructor which
  * will be called on EVERY allocation.
  *
  * For example, in the initializer you may want to initialize embedded locks,
  * NULL list pointers, set up initial states, magic numbers, etc.  This way if
  * the object is held in the allocator and re-used it won't be necessary to
  * re-initialize it.
  *
  * The constructor may be used to lock a data structure, link it on to lists,
  * bump reference counts or total counts of outstanding structures, etc.
  *
  */
 
 
 /* Function proto types */
 
 /*
  * Create a new uma zone
  *
  * Arguments:
  *	name  The text name of the zone for debugging and stats. This memory
  *		should not be freed until the zone has been deallocated.
  *	size  The size of the object that is being created.
  *	ctor  The constructor that is called when the object is allocated.
  *	dtor  The destructor that is called when the object is freed.
  *	init  An initializer that sets up the initial state of the memory.
  *	fini  A discard function that undoes initialization done by init.
  *		ctor/dtor/init/fini may all be null, see notes above.
  *	align A bitmask that corresponds to the requested alignment
  *		eg 4 would be 0x3
  *	flags A set of parameters that control the behavior of the zone.
  *
  * Returns:
  *	A pointer to a structure which is intended to be opaque to users of
  *	the interface.  The value may be null if the wait flag is not set.
  */
 uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor,
 		    uma_dtor dtor, uma_init uminit, uma_fini fini,
 		    int align, u_int32_t flags);
 
 /*
  * Create a secondary uma zone
  *
  * Arguments:
  *	name  The text name of the zone for debugging and stats. This memory
  *		should not be freed until the zone has been deallocated.
  *	ctor  The constructor that is called when the object is allocated.
  *	dtor  The destructor that is called when the object is freed.
  *	zinit  An initializer that sets up the initial state of the memory
  *		as the object passes from the Keg's slab to the Zone's cache.
  *	zfini  A discard function that undoes initialization done by init
  *		as the object passes from the Zone's cache to the Keg's slab.
  *
  *		ctor/dtor/zinit/zfini may all be null, see notes above.
  *		Note that the zinit and zfini specified here are NOT
  *		exactly the same as the init/fini specified to uma_zcreate()
  *		when creating a master zone.  These zinit/zfini are called
  *		on the TRANSITION from keg to zone (and vice-versa). Once
  *		these are set, the primary zone may alter its init/fini
  *		(which are called when the object passes from VM to keg)
  *		using uma_zone_set_init/fini()) as well as its own
  *		zinit/zfini (unset by default for master zone) with
  *		uma_zone_set_zinit/zfini() (note subtle 'z' prefix).
  *
  *	master  A reference to this zone's Master Zone (Primary Zone),
  *		which contains the backing Keg for the Secondary Zone
  *		being added.
  *
  * Returns:
  *	A pointer to a structure which is intended to be opaque to users of
  *	the interface.  The value may be null if the wait flag is not set.
  */
 uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_zone_t master);
 
 /*
  * Add a second master to a secondary zone.  This provides multiple data
  * backends for objects with the same size.  Both masters must have
  * compatible allocation flags.  Presently, UMA_ZONE_MALLOC type zones are
  * the only supported.
  *
  * Returns:
  *	Error on failure, 0 on success.
  */
 int uma_zsecond_add(uma_zone_t zone, uma_zone_t master);
 
 /*
  * Definitions for uma_zcreate flags
  *
  * These flags share space with UMA_ZFLAGs in uma_int.h.  Be careful not to
  * overlap when adding new features.  0xf0000000 is in use by uma_int.h.
  */
 #define UMA_ZONE_PAGEABLE	0x0001	/* Return items not fully backed by
 					   physical memory XXX Not yet */
 #define UMA_ZONE_ZINIT		0x0002	/* Initialize with zeros */
 #define UMA_ZONE_STATIC		0x0004	/* Statically sized zone */
 #define UMA_ZONE_OFFPAGE	0x0008	/* Force the slab structure allocation
 					   off of the real memory */
 #define UMA_ZONE_MALLOC		0x0010	/* For use by malloc(9) only! */
 #define UMA_ZONE_NOFREE		0x0020	/* Do not free slabs of this type! */
 #define UMA_ZONE_MTXCLASS	0x0040	/* Create a new lock class */
 #define	UMA_ZONE_VM		0x0080	/*
 					 * Used for internal vm datastructures
 					 * only.
 					 */
 #define	UMA_ZONE_HASH		0x0100	/*
 					 * Use a hash table instead of caching
 					 * information in the vm_page.
 					 */
 #define	UMA_ZONE_SECONDARY	0x0200	/* Zone is a Secondary Zone */
 #define	UMA_ZONE_REFCNT		0x0400	/* Allocate refcnts in slabs */
 #define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets */
 #define	UMA_ZONE_CACHESPREAD	0x1000	/*
 					 * Spread memory start locations across
 					 * all possible cache lines.  May
 					 * require many virtually contiguous
 					 * backend pages and can fail early.
 					 */
 #define	UMA_ZONE_VTOSLAB	0x2000	/* Zone uses vtoslab for lookup. */
 #define	UMA_ZONE_NODUMP		0x4000	/*
 					 * Zone's pages will not be included in
 					 * mini-dumps.
 					 */
 
 /*
  * These flags are shared between the keg and zone.  In zones wishing to add
  * new kegs these flags must be compatible.  Some are determined based on
  * physical parameters of the request and may not be provided by the consumer.
  */
 #define	UMA_ZONE_INHERIT						\
     (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE |		\
     UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB)
 
 /* Definitions for align */
 #define UMA_ALIGN_PTR	(sizeof(void *) - 1)	/* Alignment fit for ptr */
 #define UMA_ALIGN_LONG	(sizeof(long) - 1)	/* "" long */
 #define UMA_ALIGN_INT	(sizeof(int) - 1)	/* "" int */
 #define UMA_ALIGN_SHORT	(sizeof(short) - 1)	/* "" short */
 #define UMA_ALIGN_CHAR	(sizeof(char) - 1)	/* "" char */
 #define UMA_ALIGN_CACHE	(0 - 1)			/* Cache line size align */
 
 /*
  * Destroys an empty uma zone.  If the zone is not empty uma complains loudly.
  *
  * Arguments:
  *	zone  The zone we want to destroy.
  *
  */
 void uma_zdestroy(uma_zone_t zone);
 
 /*
  * Allocates an item out of a zone
  *
  * Arguments:
  *	zone  The zone we are allocating from
  *	arg   This data is passed to the ctor function
  *	flags See sys/malloc.h for available flags.
  *
  * Returns:
  *	A non-null pointer to an initialized element from the zone is
  *	guaranteed if the wait flag is M_WAITOK.  Otherwise a null pointer
  *	may be returned if the zone is empty or the ctor failed.
  */
 
 void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags);
 
 /*
  * Allocates an item out of a zone without supplying an argument
  *
  * This is just a wrapper for uma_zalloc_arg for convenience.
  *
  */
 static __inline void *uma_zalloc(uma_zone_t zone, int flags);
 
 static __inline void *
 uma_zalloc(uma_zone_t zone, int flags)
 {
 	/* A NULL arg is passed through to uma_zalloc_arg(). */
 	return uma_zalloc_arg(zone, NULL, flags);
 }
 
 /*
  * Frees an item back into the specified zone.
  *
  * Arguments:
  *	zone  The zone the item was originally allocated out of.
  *	item  The memory to be freed.
  *	arg   Argument passed to the destructor
  *
  * Returns:
  *	Nothing.
  */
 
 void uma_zfree_arg(uma_zone_t zone, void *item, void *arg);
 
 /*
  * Frees an item back to a zone without supplying an argument
  *
  * This is just a wrapper for uma_zfree_arg for convenience.
  *
  */
 static __inline void uma_zfree(uma_zone_t zone, void *item);
 
 static __inline void
 uma_zfree(uma_zone_t zone, void *item)
 {
 	/* A NULL arg is passed through to uma_zfree_arg(). */
 	uma_zfree_arg(zone, item, NULL);
 }
 
 /*
  * XXX The rest of the prototypes in this header are h0h0 magic for the VM.
  * If you think you need to use it for a normal zone you're probably incorrect.
  */
 
 /*
  * Backend page supplier routines
  *
  * Arguments:
  *	zone  The zone that is requesting pages.
  *	size  The number of bytes being requested.
  *	pflag Flags for these memory pages, see below.
  *	wait  Indicates our willingness to block.
  *
  * Returns:
  *	A pointer to the allocated memory or NULL on failure.
  */
 
 typedef void *(*uma_alloc)(uma_zone_t zone, int size, u_int8_t *pflag, int wait);
 
 /*
  * Backend page free routines
  *
  * Arguments:
  *	item  A pointer to the previously allocated pages.
  *	size  The original size of the allocation.
  *	pflag The flags for the slab.  See UMA_SLAB_* below.
  *
  * Returns:
  *	None
  */
 typedef void (*uma_free)(void *item, int size, u_int8_t pflag);
 
 
 
 /*
  * Sets up the uma allocator. (Called by vm_mem_init)
  *
  * Arguments:
  *	bootmem  A pointer to memory used to bootstrap the system.
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	This memory is used for zones which allocate things before the
  *	backend page supplier can give us pages.  It should be
  *	UMA_SLAB_SIZE * boot_pages bytes. (see uma_int.h)
  *
  */
 
 void uma_startup(void *bootmem, int boot_pages);
 
 /*
  * Finishes starting up the allocator.  This should
  * be called when kva is ready for normal allocs.
  *
  * Arguments:
  *	None
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	uma_startup2 is called by kmeminit() to enable use of uma for malloc.
  */
 
 void uma_startup2(void);
 
 /*
  * Reclaims unused memory for all zones
  *
  * Arguments:
  *	None
  * Returns:
  *	None
  *
  * This should only be called by the page out daemon.
  */
 
 void uma_reclaim(void);
 
 /*
  * Sets the alignment mask to be used for all zones requesting cache
  * alignment.  Should be called by MD boot code prior to starting VM/UMA.
  *
  * Arguments:
  *	align The alignment mask
  *
  * Returns:
  *	Nothing
  */
 void uma_set_align(int align);
 
 /*
- * Switches the backing object of a zone
+ * Switches the backing object of a zone to VM_ALLOC_NOOBJ.
  *
  * Arguments:
  *	zone  The zone to update.
- *	obj   The VM object to use for future allocations.
- *	size  The size of the object to allocate.
+ *	nitems  The number of items previewed to be allocated.
  *
  * Returns:
  *	0  if kva space can not be allocated
  *	1  if successful
  *
  * Discussion:
- *	A NULL object can be used and uma will allocate one for you.  Setting
- *	the size will limit the amount of memory allocated to this zone.
+ *	The size will limit the amount of memory allocated to this zone.
  *
  */
-struct vm_object;
-int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size);
+int uma_zone_reserve_kva(uma_zone_t zone, int nitems);
 
 /*
  * Sets a high limit on the number of items allowed in a zone
  *
  * Arguments:
  *	zone  The zone to limit
  *	nitems  The requested upper limit on the number of items allowed
  *
  * Returns:
  *	int  The effective value of nitems after rounding up based on page size
  */
 int uma_zone_set_max(uma_zone_t zone, int nitems);
 
 /*
  * Obtains the effective limit on the number of items in a zone
  *
  * Arguments:
  *	zone  The zone to obtain the effective limit from
  *
  * Return:
  *	0  No limit
  *	int  The effective limit of the zone
  */
 int uma_zone_get_max(uma_zone_t zone);
 
 /*
  * Sets a warning to be printed when limit is reached
  *
  * Arguments:
  *	zone  The zone we will warn about
  *	warning  Warning content
  *
  * Returns:
  *	Nothing
  */
 void uma_zone_set_warning(uma_zone_t zone, const char *warning);
 
 /*
  * Obtains the approximate current number of items allocated from a zone
  *
  * Arguments:
  *	zone  The zone to obtain the current allocation count from
  *
  * Return:
  *	int  The approximate current number of items allocated from the zone
  */
 int uma_zone_get_cur(uma_zone_t zone);
 
 /*
  * The following two routines (uma_zone_set_init/fini)
  * are used to set the backend init/fini pair which acts on an
  * object as it becomes allocated and is placed in a slab within
  * the specified zone's backing keg.  These should probably not
  * be changed once allocations have already begun, but only be set
  * immediately upon zone creation.
  */
 void uma_zone_set_init(uma_zone_t zone, uma_init uminit);
 void uma_zone_set_fini(uma_zone_t zone, uma_fini fini);
 
 /*
  * The following two routines (uma_zone_set_zinit/zfini) are
  * used to set the zinit/zfini pair which acts on an object as
  * it passes from the backing Keg's slab cache to the
  * specified Zone's bucket cache.  These should probably not
  * be changed once allocations have already begun, but only be set
  * immediately upon zone creation.
  */
 void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit);
 void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
 
 /*
  * Replaces the standard page_alloc or obj_alloc functions for this zone
  *
  * Arguments:
  *	zone   The zone whose backend allocator is being changed.
  *	allocf A pointer to the allocation function
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  *	This could be used to implement pageable allocation, or perhaps
  *	even DMA allocators if used in conjunction with the OFFPAGE
  *	zone flag.
  */
 
 void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf);
 
 /*
  * Used for freeing memory provided by the allocf above
  *
  * Arguments:
  *	zone  The zone that intends to use this free routine.
  *	freef The page freeing routine.
  *
  * Returns:
  *	Nothing
  */
 
 void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
 
 /*
  * These flags are settable in the allocf and visible in the freef.
  */
 #define UMA_SLAB_BOOT	0x01		/* Slab alloced from boot pages */
 #define UMA_SLAB_KMEM	0x02		/* Slab alloced from kmem_map */
 #define UMA_SLAB_KERNEL	0x04		/* Slab alloced from kernel_map */
 #define UMA_SLAB_PRIV	0x08		/* Slab alloced from priv allocator */
 #define UMA_SLAB_OFFP	0x10		/* Slab is managed separately  */
 #define UMA_SLAB_MALLOC	0x20		/* Slab is a large malloc slab */
 /* 0x40 and 0x80 are available */
 
 /*
  * Used to pre-fill a zone with some number of items
  *
  * Arguments:
  *	zone    The zone to fill
  *	itemcnt The number of items to reserve
  *
  * Returns:
  *	Nothing
  *
  * NOTE: This is blocking and should only be done at startup
  */
 void uma_prealloc(uma_zone_t zone, int itemcnt);
 
 /*
  * Used to lookup the reference counter allocated for an item
  * from a UMA_ZONE_REFCNT zone.  For UMA_ZONE_REFCNT zones,
  * reference counters are allocated for items and stored in
  * the underlying slab header.
  *
  * Arguments:
  *	zone  The UMA_ZONE_REFCNT zone to which the item belongs.
  *	item  The address of the item for which we want a refcnt.
  *
  * Returns:
  *	A pointer to a u_int32_t reference counter.
  */
 u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
 
 /*
  * Used to determine if a fixed-size zone is exhausted.
  *
  * Arguments:
  *	zone    The zone to check
  *
  * Returns:
  *	Non-zero if zone is exhausted.
  */
 int uma_zone_exhausted(uma_zone_t zone);
 int uma_zone_exhausted_nolock(uma_zone_t zone);
 
 /*
  * Exported statistics structures to be used by user space monitoring tools.
  * Statistics stream consists of a uma_stream_header, followed by a series of
  * alternating uma_type_header and uma_type_stat structures.
  */
 #define	UMA_STREAM_VERSION	0x00000001
 struct uma_stream_header {
 	u_int32_t	ush_version;	/* Stream format version. */
 	u_int32_t	ush_maxcpus;	/* Value of MAXCPU for stream. */
 	u_int32_t	ush_count;	/* Number of records. */
 	u_int32_t	_ush_pad;	/* Pad/reserved field. */
 };
 
 #define	UTH_MAX_NAME	32
 #define	UTH_ZONE_SECONDARY	0x00000001
 struct uma_type_header {
 	/*
 	 * Static per-zone data, some extracted from the supporting keg.
 	 */
 	char		uth_name[UTH_MAX_NAME];
 	u_int32_t	uth_align;	/* Keg: alignment. */
 	u_int32_t	uth_size;	/* Keg: requested size of item. */
 	u_int32_t	uth_rsize;	/* Keg: real size of item. */
 	u_int32_t	uth_maxpages;	/* Keg: maximum number of pages. */
 	u_int32_t	uth_limit;	/* Keg: max items to allocate. */
 
 	/*
 	 * Current dynamic zone/keg-derived statistics.
 	 */
 	u_int32_t	uth_pages;	/* Keg: pages allocated. */
 	u_int32_t	uth_keg_free;	/* Keg: items free. */
 	u_int32_t	uth_zone_free;	/* Zone: items free. */
 	u_int32_t	uth_bucketsize;	/* Zone: desired bucket size. */
 	u_int32_t	uth_zone_flags;	/* Zone: flags. */
 	u_int64_t	uth_allocs;	/* Zone: number of allocations. */
 	u_int64_t	uth_frees;	/* Zone: number of frees. */
 	u_int64_t	uth_fails;	/* Zone: number of alloc failures. */
 	u_int64_t	uth_sleeps;	/* Zone: number of alloc sleeps. */
 	u_int64_t	_uth_reserved1[2];	/* Reserved. */
 };
 
 struct uma_percpu_stat {
 	u_int64_t	ups_allocs;	/* Cache: number of allocations. */
 	u_int64_t	ups_frees;	/* Cache: number of frees. */
 	u_int64_t	ups_cache_free;	/* Cache: free items in cache. */
 	u_int64_t	_ups_reserved[5];	/* Reserved. */
 };
 
 #endif
Index: user/attilio/vmc-playground/sys/vm/uma_core.c
===================================================================
--- user/attilio/vmc-playground/sys/vm/uma_core.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/uma_core.c	(revision 247224)
@@ -1,3424 +1,3432 @@
 /*-
  * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * uma_core.c  Implementation of the Universal Memory allocator
  *
  * This allocator is intended to replace the multitude of similar object caches
  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  * efficient.  A primary design goal is to return unused memory to the rest of
  * the system.  This will make the system as a whole more flexible due to the
  * ability to move memory to subsystems which most need it instead of leaving
  * pools of reserved memory unused.
  *
  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  * are well known.
  *
  */
 
 /*
  * TODO:
  *	- Improve memory usage for large allocations
  *	- Investigate cache size adjustments
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* I should really use ktr.. */
 /*
 #define UMA_DEBUG 1
 #define UMA_DEBUG_ALLOC 1
 #define UMA_DEBUG_ALLOC_1 1
 */
 
 #include "opt_ddb.h"
 #include "opt_param.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #include <ddb/ddb.h>
 
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
 
 /*
  * This is the zone and keg from which all zones are spawned.  The idea is that
  * even the zone & keg heads are allocated from the allocator, so we use the
  * bss section to bootstrap us.
  */
 static struct uma_keg masterkeg;
 static struct uma_zone masterzone_k;
 static struct uma_zone masterzone_z;
 static uma_zone_t kegs = &masterzone_k;
 static uma_zone_t zones = &masterzone_z;
 
 /* This is the zone from which all of uma_slab_t's are allocated. */
 static uma_zone_t slabzone;
 static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
 
 /*
  * The initial hash tables come out of this zone so they can be allocated
  * prior to malloc coming up.
  */
 static uma_zone_t hashzone;
 
 /* The boot-time adjusted value for cache line alignment. */
 int uma_align_cache = 64 - 1;
 
 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 
 /*
  * Are we allowed to allocate buckets?
  */
 static int bucketdisable = 1;
 
 /* Linked list of all kegs in the system */
 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 
 /* This mutex protects the keg list */
 static struct mtx uma_mtx;
 
 /* Linked list of boot time pages */
 static LIST_HEAD(,uma_slab) uma_boot_pages =
     LIST_HEAD_INITIALIZER(uma_boot_pages);
 
 /* This mutex protects the boot time pages list */
 static struct mtx uma_boot_pages_mtx;
 
 /* Is the VM done starting up? */
 static int booted = 0;
 #define	UMA_STARTUP	1
 #define	UMA_STARTUP2	2
 
 /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
 static u_int uma_max_ipers;
 static u_int uma_max_ipers_ref;
 
 /*
  * This is the handle used to schedule events that need to happen
  * outside of the allocation fast path.
  */
 static struct callout uma_callout;
 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
 
 /*
  * Argument bundle passed as the zone constructor argument, so that no
  * special allocation function has to be written just for zones.
  */
 struct uma_zctor_args {
 	const char *name;
 	size_t size;
 	uma_ctor ctor;
 	uma_dtor dtor;
 	uma_init uminit;
 	uma_fini fini;
 	uma_keg_t keg;
 	int align;
 	u_int32_t flags;
 };
 
 /*
  * Argument bundle for keg construction.  The fields mirror the parameters
  * of uma_kcreate() (zone, size, uminit, fini, align, flags).
  * NOTE(review): presumably handed to keg_ctor() as its constructor
  * argument -- confirm at the call site.
  */
 struct uma_kctor_args {
 	uma_zone_t zone;
 	size_t size;
 	uma_init uminit;
 	uma_fini fini;
 	int align;
 	u_int32_t flags;
 };
 
 /*
  * Describes one zone from which buckets of a fixed capacity are allocated.
  */
 struct uma_bucket_zone {
 	uma_zone_t	ubz_zone;	/* Zone created by bucket_init(). */
 	char		*ubz_name;	/* Name given to uma_zcreate(). */
 	int		ubz_entries;	/* Item pointers per bucket. */
 };
 
 #define	BUCKET_MAX	128
 
 /*
  * Table of bucket zones in increasing capacity.  The ubz_zone members start
  * out NULL and are filled in by bucket_init().  The table is terminated by
  * an entry whose ubz_entries is 0; the loops in bucket_init() and
  * bucket_zone_drain() stop there.
  */
 struct uma_bucket_zone bucket_zones[] = {
 	{ NULL, "16 Bucket", 16 },
 	{ NULL, "32 Bucket", 32 },
 	{ NULL, "64 Bucket", 64 },
 	{ NULL, "128 Bucket", 128 },
 	{ NULL, NULL, 0}
 };
 
 #define	BUCKET_SHIFT	4
 #define	BUCKET_ZONES	((BUCKET_MAX >> BUCKET_SHIFT) + 1)
 
 /*
  * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
  * of approximately the right size.
  */
 static uint8_t bucket_size[BUCKET_ZONES];
 
 /*
  * Flags and enumerations to be passed to internal functions.
  */
 enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
 
 #define	ZFREE_STATFAIL	0x00000001	/* Update zone failure statistic. */
 #define	ZFREE_STATFREE	0x00000002	/* Update zone free statistic. */
 
 /* Prototypes.. */
 
-static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
+static void *noobj_alloc(uma_zone_t, int, u_int8_t *, int);
 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
 static void page_free(void *, int, u_int8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static int zero_init(void *, int, int);
 static void keg_small_init(uma_keg_t keg);
 static void keg_large_init(uma_keg_t keg);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
 static void zone_timeout(uma_zone_t zone);
 static int hash_alloc(struct uma_hash *);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
 static void *zone_alloc_item(uma_zone_t, void *, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
     int);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(int, int);
 static void bucket_free(uma_bucket_t);
 static void bucket_zone_drain(void);
 static int zone_alloc_bucket(uma_zone_t zone, int flags);
 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
 static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, u_int32_t flags);
 static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
 static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
 static int zone_warnings = 1;
 TUNABLE_INT("vm.zone_warnings", &zone_warnings);
 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
     "Warn when UMA zones becomes full");
 
 /*
  * This routine checks to see whether or not it's safe to enable buckets.
  */
 
 static void
 bucket_enable(void)
 {
 	int lowmem;

 	/* Buckets stay disabled for as long as this reports nonzero. */
 	lowmem = vm_page_count_min();
 	bucketdisable = lowmem;
 }
 
 /*
  * Initialize bucket_zones, the array of zones of buckets of various sizes.
  *
  * For each zone, calculate the memory required for each bucket, consisting
  * of the header and an array of pointers.  Initialize bucket_size[] to point
  * the range of appropriate bucket sizes at the zone.
  */
 static void
 bucket_init(void)
 {
 	struct uma_bucket_zone *ubz;
 	int i;
 	int j;
 
 	for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
 		int size;
 
 		ubz = &bucket_zones[j];
 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 		    UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
 		for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
 			bucket_size[i >> BUCKET_SHIFT] = j;
 	}
 }
 
 /*
  * Given a desired number of entries for a bucket, return the zone from which
  * to allocate the bucket.
  */
 static struct uma_bucket_zone *
 bucket_zone_lookup(int entries)
 {

 	/* Round the request up to a BUCKET_SHIFT-sized step, then map it. */
 	return (&bucket_zones[bucket_size[howmany(entries,
 	    1 << BUCKET_SHIFT)]]);
 }
 
 /*
  * Allocate an empty bucket sized for approximately "entries" items.
  *
  * Arguments:
  *	entries  Desired bucket capacity
  *	bflags   Allocation flags handed to zone_alloc_item()
  *
  * Returns:
  *	The initialized bucket, or NULL if buckets are currently disabled
  *	or the allocation failed.
  */
 static uma_bucket_t
 bucket_alloc(int entries, int bflags)
 {
 	struct uma_bucket_zone *bz;
 	uma_bucket_t b;

 	/*
 	 * While bucketdisable is set no buckets are handed out.  This keeps
 	 * per cpu buckets from exhausting vm.boot_pages early on, and it
 	 * also prevents bucket allocation in low memory situations.
 	 */
 	if (bucketdisable)
 		return (NULL);

 	bz = bucket_zone_lookup(entries);
 	b = zone_alloc_item(bz->ubz_zone, NULL, bflags);
 	if (b == NULL)
 		return (NULL);

 #ifdef INVARIANTS
 	bzero(b->ub_bucket, sizeof(void *) * bz->ubz_entries);
 #endif
 	b->ub_cnt = 0;
 	b->ub_entries = bz->ubz_entries;
 	return (b);
 }
 
 /*
  * Return a bucket to the bucket zone that matches its capacity.
  */
 static void
 bucket_free(uma_bucket_t bucket)
 {
 	uma_zone_t bzone;

 	bzone = bucket_zone_lookup(bucket->ub_entries)->ubz_zone;
 	zone_free_item(bzone, bucket, NULL, SKIP_NONE, ZFREE_STATFREE);
 }
 
 static void
 bucket_zone_drain(void)
 {
 	struct uma_bucket_zone *ubz;
 
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 		zone_drain(ubz->ubz_zone);
 }
 
 /*
  * Print the zone's warning message, rate limited per zone.  Nothing is
  * printed when zone_warnings is 0 or the zone has no warning text.
  */
 static void
 zone_log_warning(uma_zone_t zone)
 {
 	static const struct timeval warninterval = { 300, 0 };

 	/* ratecheck() is only consulted when a message would be printed. */
 	if (zone_warnings && zone->uz_warning != NULL &&
 	    ratecheck(&zone->uz_ratecheck, &warninterval))
 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 }
 
 /*
  * Return the keg referenced by the first link on the zone's keg list.
  * The list head is dereferenced without a NULL check.
  */
 static inline uma_keg_t
 zone_first_keg(uma_zone_t zone)
 {
 	uma_klink_t first;

 	first = LIST_FIRST(&zone->uz_kegs);
 	return (first->kl_keg);
 }
 
 /*
  * Call kegfn once for every keg linked to the zone, in list order.
  */
 static void
 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 {
 	uma_klink_t kl;

 	for (kl = LIST_FIRST(&zone->uz_kegs); kl != NULL;
 	    kl = LIST_NEXT(kl, kl_link))
 		kegfn(kl->kl_keg);
 }
 
 /*
  * Routine called by timeout which is used to fire off some time interval
  * based calculations.  (stats, hash size, etc.)
  *
  * Arguments:
  *	arg   Unused
  *
  * Returns:
  *	Nothing
  */
 static void
 uma_timeout(void *unused)
 {

 	/* Re-evaluate whether buckets may be used, then visit each zone. */
 	bucket_enable();
 	zone_foreach(&zone_timeout);

 	/* Re-arm the callout so this runs again after UMA_TIMEOUT seconds. */
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 }
 
 /*
  * Routine to perform timeout driven calculations on a single keg.  The
  * only work done here is growing the keg's slab hash table.
  *
  * Arguments:
  *	keg  The keg to examine; its lock is taken and released here.
  *
  *  Returns nothing.
  */
 static void
 keg_timeout(uma_keg_t keg)
 {
 
 	KEG_LOCK(keg);
 	/*
 	 * Expand the keg hash table.
 	 *
 	 * This is done if the number of slabs is larger than the hash size.
 	 * What I'm trying to do here is completely reduce collisions.  This
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
 	if (keg->uk_flags & UMA_ZONE_HASH &&
 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
 		struct uma_hash oldhash;
 		int ret;
 
 		/*
 		 * This is so involved because allocating and freeing
 		 * while the keg lock is held will lead to deadlock.
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
 		newhash = keg->uk_hash;
 		KEG_UNLOCK(keg);
 		ret = hash_alloc(&newhash);
 		KEG_LOCK(keg);
 		if (ret) {
 			/*
 			 * If hash_expand() moves the slabs, install the new
 			 * table and discard the old one; if it declines,
 			 * the freshly allocated table is the one discarded.
 			 */
 			if (hash_expand(&keg->uk_hash, &newhash)) {
 				oldhash = keg->uk_hash;
 				keg->uk_hash = newhash;
 			} else
 				oldhash = newhash;
 
 			/* Free the discarded table with the lock dropped. */
 			KEG_UNLOCK(keg);
 			hash_free(&oldhash);
 			KEG_LOCK(keg);
 		}
 	}
 	KEG_UNLOCK(keg);
 }
 
 /*
  * Per-zone timeout work: run keg_timeout() on every keg of the zone.
  */
 static void
 zone_timeout(uma_zone_t zone)
 {

 	zone_foreach_keg(zone, keg_timeout);
 }
 
 /*
  * Allocate and zero fill the next sized hash table from the appropriate
  * backing store.  An initial table (old size 0) has UMA_HASH_SIZE_INIT
  * entries and comes from hashzone; any later table is twice the old size
  * and comes from malloc(9) with M_NOWAIT.
  *
  * Arguments:
  *	hash  A new hash structure with the old hash size in uh_hashsize
  *
  * Returns:
  *	1 on success and 0 on failure.
  */
 static int
 hash_alloc(struct uma_hash *hash)
 {
 	int nbytes;
 	int prevsize;

 	prevsize = hash->uh_hashsize;

 	if (prevsize == 0) {
 		nbytes = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 		    M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	} else {
 		/* We're just going to go to a power of two greater */
 		hash->uh_hashsize = prevsize * 2;
 		nbytes = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 		hash->uh_slab_hash = (struct slabhead *)malloc(nbytes,
 		    M_UMAHASH, M_NOWAIT);
 	}

 	if (hash->uh_slab_hash == NULL)
 		return (0);

 	bzero(hash->uh_slab_hash, nbytes);
 	hash->uh_hashmask = hash->uh_hashsize - 1;
 	return (1);
 }
 
 /*
  * Expands the hash table for HASH zones.  This is done from zone_timeout
  * to reduce collisions.  This must not be done in the regular allocation
  * path, otherwise, we can recurse on the vm while allocating pages.
  *
  * Arguments:
  *	oldhash  The hash you want to expand
  *	newhash  The hash structure for the new table
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  */
 static int
 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 {
 	uma_slab_t slab;
 	int hval;
 	int i;
 
 	if (!newhash->uh_slab_hash)
 		return (0);
 
 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 		return (0);
 
 	/*
 	 * I need to investigate hash algorithms for resizing without a
 	 * full rehash.
 	 */
 
 	for (i = 0; i < oldhash->uh_hashsize; i++)
 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
 			hval = UMA_HASH(newhash, slab->us_data);
 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 			    slab, us_hlink);
 		}
 
 	return (1);
 }
 
 /*
  * Free a hash table to the appropriate backing store: a table of
  * UMA_HASH_SIZE_INIT entries goes back to hashzone, any other size goes
  * back to malloc(9).
  *
  * Arguments:
  *	hash  The hash whose table is freed; a NULL table is ignored
  *
  * Returns:
  *	Nothing
  */
 static void
 hash_free(struct uma_hash *hash)
 {

 	if (hash->uh_slab_hash == NULL)
 		return;

 	if (hash->uh_hashsize != UMA_HASH_SIZE_INIT) {
 		free(hash->uh_slab_hash, M_UMAHASH);
 		return;
 	}

 	zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE,
 	    ZFREE_STATFREE);
 }
 
 /*
  * Frees all outstanding items in a bucket
  *
  * Arguments:
  *	zone   The zone to free to, must be unlocked.
  *	bucket The free/alloc bucket with items, cpu queue must be locked.
  *
  * Returns:
  *	Nothing
  */
 
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
 	void *it;

 	/* A NULL bucket is accepted and simply ignored. */
 	while (bucket != NULL && bucket->ub_cnt > 0) {
 		/* Pop from the top; the count drops before the item goes. */
 		bucket->ub_cnt--;
 		it = bucket->ub_bucket[bucket->ub_cnt];
 #ifdef INVARIANTS
 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
 		KASSERT(it != NULL,
 		    ("bucket_drain: botched ptr, item is NULL"));
 #endif
 		/* The dtor is skipped and no statistic flags are passed. */
 		zone_free_item(zone, it, NULL, SKIP_DTOR, 0);
 	}
 }
 
 /*
  * Drains the per cpu caches for a zone.
  *
  * NOTE: This may only be called while the zone is being torn down, and not
  * during normal operation.  This is necessary in order that we do not have
  * to migrate CPUs to drain the per-CPU caches.
  *
  * Arguments:
  *	zone     The zone to drain, must be unlocked.
  *
  * Returns:
  *	Nothing
  */
 static void
 cache_drain(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	int cpu;
 
 	/*
 	 * XXX: It is safe to not lock the per-CPU caches, because we're
 	 * tearing down the zone anyway.  I.e., there will be no further use
 	 * of the caches at this point.
 	 *
 	 * XXX: It would be good to be able to assert that the zone is being
 	 * torn down to prevent improper use of cache_drain().
 	 *
 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
 	 * it is used elsewhere.  Should the tear-down path be made special
 	 * there in some form?
 	 */
 	CPU_FOREACH(cpu) {
 		cache = &zone->uz_cpu[cpu];
 		/* Empty both buckets first, then free the bucket headers. */
 		bucket_drain(zone, cache->uc_allocbucket);
 		bucket_drain(zone, cache->uc_freebucket);
 		if (cache->uc_allocbucket != NULL)
 			bucket_free(cache->uc_allocbucket);
 		if (cache->uc_freebucket != NULL)
 			bucket_free(cache->uc_freebucket);
 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
 	}
 	/* Finally release the buckets queued on the zone itself. */
 	ZONE_LOCK(zone);
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Drain the cached buckets from a zone.  Expects a locked zone on entry;
  * the zone is locked again on return, but the lock is dropped temporarily
  * while each full bucket is emptied.
  */
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
 	uma_bucket_t bucket;
 
 	/*
 	 * Drain the bucket queues and free the buckets, we just keep two per
 	 * cpu (alloc/free).
 	 */
 	while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
 		LIST_REMOVE(bucket, ub_link);
 		/* The bucket is off the list; drain it without the lock. */
 		ZONE_UNLOCK(zone);
 		bucket_drain(zone, bucket);
 		bucket_free(bucket);
 		ZONE_LOCK(zone);
 	}
 
 	/* Now we do the free queue.. */
 	while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 		LIST_REMOVE(bucket, ub_link);
 		/* These buckets are freed with the zone lock still held. */
 		bucket_free(bucket);
 	}
 }
 
 /*
  * Frees pages from a keg back to the system.  This is done on demand from
  * the pageout daemon.
  *
  * The work is split in two phases: with the keg locked, every slab on the
  * free-slab list (other than boot slabs) is unlinked and collected on a
  * private list; with the keg unlocked, the collected slabs are finalized
  * and their memory is handed to the keg's free function.
  *
  * Arguments:
  *	keg  The keg to drain; it must not be locked by the caller.
  *
  * Returns nothing.
  */
 static void
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
 	uma_slab_t slab;
 	uma_slab_t n;
 	u_int8_t flags;
 	u_int8_t *mem;
 	int i;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
 	 * time
 	 */
 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 		return;
 
 #ifdef UMA_DEBUG
 	printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
 #endif
 	KEG_LOCK(keg);
 	if (keg->uk_free == 0)
 		goto finished;
 
 	/* Phase 1: unlink the free slabs and fix up the keg accounting. */
 	slab = LIST_FIRST(&keg->uk_free_slab);
 	while (slab) {
 		/* Fetch the successor first; slab is about to be unlinked. */
 		n = LIST_NEXT(slab, us_link);
 
 		/* We have no where to free these to */
 		if (slab->us_flags & UMA_SLAB_BOOT) {
 			slab = n;
 			continue;
 		}
 
 		LIST_REMOVE(slab, us_link);
 		keg->uk_pages -= keg->uk_ppera;
 		keg->uk_free -= keg->uk_ipers;
 
 		if (keg->uk_flags & UMA_ZONE_HASH)
 			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
 
 		/* us_hlink is reused to chain the slab on the local list. */
 		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 
 		slab = n;
 	}
 finished:
 	KEG_UNLOCK(keg);
 
 	/* Phase 2: finalize and release each collected slab, unlocked. */
 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 		if (keg->uk_fini)
 			for (i = 0; i < keg->uk_ipers; i++)
 				keg->uk_fini(
 				    slab->us_data + (keg->uk_rsize * i),
 				    keg->uk_size);
 		/* Save these now; the slab header may be freed below. */
 		flags = slab->us_flags;
 		mem = slab->us_data;
 
 		if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 			vm_object_t obj;
 
 			/*
 			 * NOTE(review): vsetobj() presumably points each
 			 * page back at its VM object -- confirm.
 			 */
 			if (flags & UMA_SLAB_KMEM)
 				obj = kmem_object;
 			else if (flags & UMA_SLAB_KERNEL)
 				obj = kernel_object;
 			else
 				obj = NULL;
 			for (i = 0; i < keg->uk_ppera; i++)
 				vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
 				    obj);
 		}
 		/* An off-page slab header is a separate slabzone item. */
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL,
 			    SKIP_NONE, ZFREE_STATFREE);
 #ifdef UMA_DEBUG
 		printf("%s: Returning %d bytes.\n",
 		    keg->uk_name, UMA_SLAB_SIZE * keg->uk_ppera);
 #endif
 		keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
 	}
 }
 
 /*
  * Drain a zone: release its cached buckets, then let every keg of the zone
  * give back its free slabs.
  *
  * Arguments:
  *	zone    The zone to drain, must be unlocked.
  *	waitok  With M_NOWAIT the function returns at once when another
  *		drain of the zone is already in progress; with any other
  *		value it sleeps until that drain has finished.
  *
  * Returns:
  *	Nothing
  */
 static void
 zone_drain_wait(uma_zone_t zone, int waitok)
 {
 
 	/*
 	 * Set draining to interlock with zone_dtor() so we can release our
 	 * locks as we go.  Only dtor() should do a WAITOK call since it
 	 * is the only call that knows the structure will still be available
 	 * when it wakes up.
 	 */
 	ZONE_LOCK(zone);
 	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 		if (waitok == M_NOWAIT)
 			goto out;
 		/*
 		 * NOTE(review): uma_mtx is dropped around the sleep, so the
 		 * caller on this path is assumed to hold it -- confirm.
 		 */
 		mtx_unlock(&uma_mtx);
 		msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
 		mtx_lock(&uma_mtx);
 	}
 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 	/*
 	 * The DRAINING flag protects us from being freed while
 	 * we're running.  Normally the uma_mtx would protect us but we
 	 * must be able to release and acquire the right lock for each keg.
 	 */
 	zone_foreach_keg(zone, &keg_drain);
 	ZONE_LOCK(zone);
 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 	/* Wake any thread sleeping on the zone in the loop above. */
 	wakeup(zone);
 out:
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Drain a zone without sleeping: calls zone_drain_wait() with M_NOWAIT.
  */
 void
 zone_drain(uma_zone_t zone)
 {
 
 	zone_drain_wait(zone, M_NOWAIT);
 }
 
 /*
  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
  *
  * The keg lock must be held on entry.  It is dropped while memory is being
  * allocated and initialized, and it is held again on every return path.
  *
  * Arguments:
  *	keg   The keg that receives the slab
  *	zone  Passed through to the keg's allocation function
  *	wait  Shall we wait?
  *
  * Returns:
  *	The slab that was allocated or NULL if there is no memory and the
  *	caller specified M_NOWAIT.  NULL is also returned when the keg's
  *	init function fails on one of the items.
  */
 static uma_slab_t
 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
 {
 	uma_slabrefcnt_t slabref;
 	uma_alloc allocf;
 	uma_slab_t slab;
 	u_int8_t *mem;
 	u_int8_t flags;
 	int i;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 
 #ifdef UMA_DEBUG
 	printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
 #endif
 	/* Read the allocator while still locked, then drop the lock. */
 	allocf = keg->uk_allocf;
 	KEG_UNLOCK(keg);
 
 	/* An off-page slab header is a separate item from uk_slabzone. */
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
 		if (slab == NULL) {
 			KEG_LOCK(keg);
 			return NULL;
 		}
 	}
 
 	/*
 	 * This reproduces the old vm_zone behavior of zero filling pages the
 	 * first time they are added to a zone.
 	 *
 	 * Malloced items are zeroed in uma_zalloc.
 	 */
 
 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
 		wait |= M_ZERO;
 	else
 		wait &= ~M_ZERO;
 
 	if (keg->uk_flags & UMA_ZONE_NODUMP)
 		wait |= M_NODUMP;
 
 	/* zone is passed for legacy reasons. */
 	mem = allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait);
 	if (mem == NULL) {
 		/* Give back the off-page header allocated above. */
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL,
 			    SKIP_NONE, ZFREE_STATFREE);
 		KEG_LOCK(keg);
 		return (NULL);
 	}
 
 	/* Point the slab into the allocated memory */
 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
 
 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
 		for (i = 0; i < keg->uk_ppera; i++)
 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
 
 	slab->us_keg = keg;
 	slab->us_data = mem;
 	slab->us_freecount = keg->uk_ipers;
 	slab->us_firstfree = 0;
 	slab->us_flags = flags;
 
 	/* Chain the free list: entry i names item i + 1 as its successor. */
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		slabref = (uma_slabrefcnt_t)slab;
 		for (i = 0; i < keg->uk_ipers; i++) {
 			slabref->us_freelist[i].us_refcnt = 0;
 			slabref->us_freelist[i].us_item = i+1;
 		}
 	} else {
 		for (i = 0; i < keg->uk_ipers; i++)
 			slab->us_freelist[i].us_item = i+1;
 	}
 
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++)
 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
 			    keg->uk_size, wait) != 0)
 				break;
 		/* init failed on item i: undo everything done so far. */
 		if (i != keg->uk_ipers) {
 			if (keg->uk_fini != NULL) {
 				/* fini only the items that were initialized. */
 				for (i--; i > -1; i--)
 					keg->uk_fini(slab->us_data +
 					    (keg->uk_rsize * i),
 					    keg->uk_size);
 			}
 			if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 				vm_object_t obj;
 
 				if (flags & UMA_SLAB_KMEM)
 					obj = kmem_object;
 				else if (flags & UMA_SLAB_KERNEL)
 					obj = kernel_object;
 				else
 					obj = NULL;
 				for (i = 0; i < keg->uk_ppera; i++)
 					vsetobj((vm_offset_t)mem +
 					    (i * PAGE_SIZE), obj);
 			}
 			if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 				zone_free_item(keg->uk_slabzone, slab,
 				    NULL, SKIP_NONE, ZFREE_STATFREE);
 			keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
 			    flags);
 			KEG_LOCK(keg);
 			return (NULL);
 		}
 	}
 	KEG_LOCK(keg);
 
 	/* Publish the slab in the keg's bookkeeping under the lock. */
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 
 	keg->uk_pages += keg->uk_ppera;
 	keg->uk_free += keg->uk_ipers;
 
 	return (slab);
 }
 
 /*
  * This function is intended to be used early on in place of page_alloc() so
  * that we may use the boot time page cache to satisfy allocations before
  * the VM is ready.
  *
  * Arguments:
  *	zone   The zone making the request; its first keg is the one whose
  *	       allocator is switched once the boot pages run out
  *	bytes  The number of bytes requested
  *	pflag  Receives the slab flags of the memory returned
  *	wait   Passed to the real allocator when it has to be used
  *
  * Returns:
  *	A pointer to the memory, or whatever the real allocator returns
  *	once the boot page list can no longer satisfy the request.
  */
 static void *
 startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 {
 	uma_keg_t keg;
 	uma_slab_t tmps;
 	int pages, check_pages;
 
 	keg = zone_first_keg(zone);
 	pages = howmany(bytes, PAGE_SIZE);
 	/* Following pages - 1 links proves the list holds enough entries. */
 	check_pages = pages - 1;
 	KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
 
 	/*
 	 * Check our small startup cache to see if it has pages remaining.
 	 */
 	mtx_lock(&uma_boot_pages_mtx);
 
 	/* First check if we have enough room. */
 	tmps = LIST_FIRST(&uma_boot_pages);
 	while (tmps != NULL && check_pages-- > 0)
 		tmps = LIST_NEXT(tmps, us_link);
 	if (tmps != NULL) {
 		/*
 		 * It's ok to lose tmps references.  The last one will
 		 * have tmps->us_data pointing to the start address of
 		 * "pages" contiguous pages of memory.
 		 */
 		while (pages-- > 0) {
 			tmps = LIST_FIRST(&uma_boot_pages);
 			LIST_REMOVE(tmps, us_link);
 		}
 		mtx_unlock(&uma_boot_pages_mtx);
 		*pflag = tmps->us_flags;
 		return (tmps->us_data);
 	}
 	mtx_unlock(&uma_boot_pages_mtx);
 	/* Before UMA_STARTUP2 there is no other allocator to fall back on. */
 	if (booted < UMA_STARTUP2)
 		panic("UMA: Increase vm.boot_pages");
 	/*
 	 * Now that we've booted reset these users to their real allocator.
 	 */
 #ifdef UMA_MD_SMALL_ALLOC
 	keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
 #else
 	keg->uk_allocf = page_alloc;
 #endif
 	/* Serve this request through the allocator just installed. */
 	return keg->uk_allocf(zone, bytes, pflag, wait);
 }
 
 /*
  * Allocates a number of pages from the system
  *
  * Arguments:
  *	bytes  The number of bytes requested
  *	wait  Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 {

 	/* Tag the memory as coming from kmem_map, then allocate it. */
 	*pflag = UMA_SLAB_KMEM;
 	return ((void *)kmem_malloc(kmem_map, bytes, wait));
 }
 
 /*
  * Allocates a number of pages from within an object
  *
  * Arguments:
  *	bytes  The number of bytes requested
  *	wait   Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
-obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+noobj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
 {
-	vm_object_t object;
+	TAILQ_HEAD(, vm_page) alloctail;
+	u_long npages;
 	vm_offset_t retkva, zkva;
-	vm_page_t p;
-	int pages, startpages;
+	vm_page_t p, p_next;
 	uma_keg_t keg;
 
+	TAILQ_INIT(&alloctail);
 	keg = zone_first_keg(zone);
-	object = keg->uk_obj;
-	retkva = 0;
 
-	/*
-	 * This looks a little weird since we're getting one page at a time.
-	 */
-	VM_OBJECT_LOCK(object);
-	p = TAILQ_LAST(&object->memq, pglist);
-	pages = p != NULL ? p->pindex + 1 : 0;
-	startpages = pages;
-	zkva = keg->uk_kva + pages * PAGE_SIZE;
-	for (; bytes > 0; bytes -= PAGE_SIZE) {
-		p = vm_page_alloc(object, pages,
-		    VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
-		if (p == NULL) {
-			if (pages != startpages)
-				pmap_qremove(retkva, pages - startpages);
-			while (pages != startpages) {
-				pages--;
-				p = TAILQ_LAST(&object->memq, pglist);
-				vm_page_unwire(p, 0);
-				vm_page_free(p);
-			}
-			retkva = 0;
-			goto done;
+	npages = howmany(bytes, PAGE_SIZE);
+	while (npages > 0) {
+		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
+		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
+		if (p != NULL) {
+			/*
+			 * The page belongs to no object, so its otherwise
+			 * unused listq linkage can chain it on alloctail.
+			 */
+			TAILQ_INSERT_TAIL(&alloctail, p, listq);
+			npages--;
+			continue;
 		}
+		if (wait & M_WAITOK) {
+			VM_WAIT;
+			continue;
+		}
+
+		/*
+		 * Allocation failed and the caller cannot sleep: unwire
+		 * and free the pages gathered so far, then return NULL.
+		 */
+		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
+			vm_page_unwire(p, 0);
+			vm_page_free(p); 
+		}
+		return (NULL);
+	}
+	*flags = UMA_SLAB_PRIV;
+	zkva = keg->uk_kva +
+	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
+	retkva = zkva;
+	TAILQ_FOREACH(p, &alloctail, listq) {
 		pmap_qenter(zkva, &p, 1);
-		if (retkva == 0)
-			retkva = zkva;
 		zkva += PAGE_SIZE;
-		pages += 1;
 	}
-done:
-	VM_OBJECT_UNLOCK(object);
-	*flags = UMA_SLAB_PRIV;
 
 	return ((void *)retkva);
 }
 
 /*
  * Frees a number of pages to the system
  *
  * Arguments:
  *	mem   A pointer to the memory to be freed
  *	size  The size of the memory being freed
  *	flags The original p->us_flags field
  *
  * Returns:
  *	Nothing
  */
 static void
 page_free(void *mem, int size, u_int8_t flags)
 {
 	vm_map_t map;

 	/* The slab flags must name the map the memory came from. */
 	if ((flags & (UMA_SLAB_KMEM | UMA_SLAB_KERNEL)) == 0)
 		panic("UMA: page_free used with invalid flags %d", flags);

 	/* UMA_SLAB_KMEM takes precedence when both flags are set. */
 	map = (flags & UMA_SLAB_KMEM) ? kmem_map : kernel_map;
 	kmem_free(map, (vm_offset_t)mem, size);
 }
 
 /*
  * Zero fill initializer
  *
  * Arguments/Returns follow uma_init specifications
  */
 static int
 zero_init(void *mem, int size, int flags)
 {

 	/* The flags argument is not consulted; the item is simply cleared. */
 	bzero(mem, size);

 	/* Zero filling cannot fail, so success is always reported. */
 	return (0);
 }
 
 /*
  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
  *
  * Arguments
  *	keg  The zone we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_small_init(uma_keg_t keg)
 {
 	u_int itemsz;	/* Item size; later includes the free-list link. */
 	u_int hdrsz;	/* Bytes taken by an in-page slab header. */
 	u_int used;
 	u_int waste;

 	KASSERT(keg != NULL, ("Keg is null in keg_small_init"));

 	/* Clamp to the smallest unit, then round up to the alignment. */
 	itemsz = keg->uk_size;
 	if (itemsz < UMA_SMALLEST_UNIT)
 		itemsz = UMA_SMALLEST_UNIT;
 	if ((itemsz & keg->uk_align) != 0)
 		itemsz = (itemsz & ~keg->uk_align) + (keg->uk_align + 1);

 	keg->uk_rsize = itemsz;
 	keg->uk_ppera = 1;

 	/*
 	 * From here on itemsz also covers the per-item linkage that lives
 	 * in the slab, except for OFFPAGE kegs where nothing is added.
 	 */
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 		hdrsz = 0;
 	} else if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		itemsz += UMA_FRITMREF_SZ;	/* linkage & refcnt */
 		hdrsz = sizeof(struct uma_slab_refcnt);
 	} else {
 		itemsz += UMA_FRITM_SZ;	/* Account for linkage */
 		hdrsz = sizeof(struct uma_slab);
 	}

 	keg->uk_ipers = (UMA_SLAB_SIZE - hdrsz) / itemsz;
 	KASSERT(keg->uk_ipers != 0, ("keg_small_init: ipers is 0"));
 	used = keg->uk_ipers * itemsz + hdrsz;
 	waste = UMA_SLAB_SIZE - used;

 	/*
 	 * We can't do OFFPAGE if we're internal or if we've been
 	 * asked to not go to the VM for buckets.  If we do this we
 	 * may end up going to the VM (kmem_map) for slabs which we
 	 * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
 	 * result of UMA_ZONE_VM, which clearly forbids it.
 	 */
 	if ((keg->uk_flags &
 	    (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY)) != 0)
 		return;

 	/*
 	 * Keep the in-page header unless too much space is wasted and an
 	 * off-page header would let more items fit into the slab.
 	 */
 	if (waste < UMA_MAX_WASTE ||
 	    keg->uk_ipers >= (UMA_SLAB_SIZE / keg->uk_rsize))
 		return;

 	keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
 	KASSERT(keg->uk_ipers <= 255,
 	    ("keg_small_init: keg->uk_ipers too high!"));
 #ifdef UMA_DEBUG
 	printf("UMA decided we need offpage slab headers for "
 	    "keg: %s, calculated wastedspace = %d, "
 	    "maximum wasted space allowed = %d, "
 	    "calculated ipers = %d, "
 	    "new wasted space = %d\n", keg->uk_name, waste,
 	    UMA_MAX_WASTE, keg->uk_ipers,
 	    UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
 #endif
 	keg->uk_flags |= UMA_ZONE_OFFPAGE;
 	if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
 /*
  * Finish setting up a keg whose items are larger than one slab.  Each slab
  * then holds exactly one item spread over as many pages as the item needs,
  * and (unless the keg is internal) the slab header is kept off-page.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_large_init(uma_keg_t keg)
 {
 	int npages;
 
 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
 	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
 
 	/* Whole slabs needed for one item, rounding any remainder up. */
 	npages = keg->uk_size / UMA_SLAB_SIZE;
 	if ((keg->uk_size % UMA_SLAB_SIZE) != 0)
 		npages++;
 
 	keg->uk_rsize = keg->uk_size;
 	keg->uk_ipers = 1;
 	keg->uk_ppera = npages;
 
 	/* Internal kegs cannot use OFFPAGE; everything else does. */
 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0) {
 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
 		if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 			keg->uk_flags |= UMA_ZONE_HASH;
 	}
 }
 
 /*
  * Finish setting up a UMA_ZONE_CACHESPREAD keg.  Items are padded so that
  * consecutive items start on different alignment boundaries, slabs span
  * several pages, and the slab header is always off-page with vtoslab
  * lookup.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_cachespread_init(uma_keg_t keg)
 {
 	int align;
 	int itemsz;
 	int npages;
 	int pad;
 
 	align = keg->uk_align + 1;
 	itemsz = keg->uk_size;
 
 	/*
 	 * One item should start on every align boundary in a page, which
 	 * requires spanning pages.  Round the item up to the alignment and,
 	 * if the result is an even multiple of align, add one more align so
 	 * that items do not land on the same boundary every time.
 	 */
 	if ((itemsz & keg->uk_align) != 0)
 		itemsz = (itemsz & ~keg->uk_align) + align;
 	if ((itemsz & align) == 0)
 		itemsz += align;
 
 	/* Padding added to the last item does not need to fit in the slab. */
 	pad = itemsz - keg->uk_size;
 
 	/* Enough pages for PAGE_SIZE / align items, capped at 128KB. */
 	npages = (itemsz * (PAGE_SIZE / align)) / PAGE_SIZE;
 	npages = MIN(npages, (128 * 1024) / PAGE_SIZE);
 
 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
 	keg->uk_rsize = itemsz;
 	keg->uk_ppera = npages;
 	keg->uk_ipers = ((npages * PAGE_SIZE) + pad) / itemsz;
 	KASSERT(keg->uk_ipers <= uma_max_ipers,
 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
 	    keg->uk_ipers));
 }
 
 /*
  * Keg header ctor.  Fills in every field of a new keg from the creation
  * arguments, picks the slab layout and the page allocator, initializes the
  * keg lock, and links the keg's first zone and the keg itself onto their
  * lists.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_kctor_args
  */
 static int
 keg_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_kctor_args *arg;
 	uma_zone_t zone;
 	uma_keg_t keg;
 	int large;
 
 	arg = udata;
 	keg = mem;
 
 	/* Start from a clean slate, then copy in the caller's parameters. */
 	bzero(keg, size);
 	keg->uk_size = arg->size;
 	keg->uk_align = arg->align;
 	keg->uk_flags = arg->flags;
 	keg->uk_init = arg->uminit;
 	keg->uk_fini = arg->fini;
 	keg->uk_allocf = page_alloc;
 	keg->uk_freef = page_free;
 	keg->uk_slabzone = NULL;
 	keg->uk_free = 0;
 	keg->uk_pages = 0;
 	keg->uk_recurse = 0;
 
 	/*
 	 * The master zone is passed to us at keg-creation time.
 	 */
 	zone = arg->zone;
 	keg->uk_name = zone->uz_name;
 
 	/* Translate creation flags into keg behaviour. */
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
 	if (arg->flags & UMA_ZONE_VM)
 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
 
 	if ((arg->flags & (UMA_ZONE_REFCNT | UMA_ZONE_MALLOC)) != 0)
 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
 
 	/*
 	 * Choose the slab layout.  The free-list linkage size is added to
 	 * uk_size in the "large" test because keg_small_init() adds it to
 	 * the item size as well; if it were not accounted for here we could
 	 * end up in keg_small_init() with a calculated 'ipers' of 0.
 	 */
 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
 		keg_cachespread_init(keg);
 	} else {
 		if (keg->uk_flags & UMA_ZONE_REFCNT)
 			large = (keg->uk_size+UMA_FRITMREF_SZ) >
 			    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt));
 		else
 			large = (keg->uk_size+UMA_FRITM_SZ) >
 			    (UMA_SLAB_SIZE - sizeof(struct uma_slab));
 
 		if (large)
 			keg_large_init(keg);
 		else
 			keg_small_init(keg);
 	}
 
 	/* Off-page headers come from the matching slab-header zone. */
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		keg->uk_slabzone = (keg->uk_flags & UMA_ZONE_REFCNT) ?
 		    slabrefzone : slabzone;
 
 	/*
 	 * If we haven't booted yet we need allocations to go through the
 	 * startup cache until the vm is ready.
 	 */
 	if (keg->uk_ppera == 1) {
 #ifdef UMA_MD_SMALL_ALLOC
 		if (booted < UMA_STARTUP)
 			keg->uk_allocf = startup_alloc;
 		else
 			keg->uk_allocf = uma_small_alloc;
 		keg->uk_freef = uma_small_free;
 #else
 		if (booted < UMA_STARTUP2)
 			keg->uk_allocf = startup_alloc;
 #endif
 	} else if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) != 0 &&
 	    booted < UMA_STARTUP2)
 		keg->uk_allocf = startup_alloc;
 
 	/*
 	 * Initialize keg's lock (shared among zones).
 	 */
 	if (arg->flags & UMA_ZONE_MTXCLASS)
 		KEG_LOCK_INIT(keg, 1);
 	else
 		KEG_LOCK_INIT(keg, 0);
 
 	/*
 	 * If we're putting the slab header in the actual page we need to
 	 * figure out where in each page it goes.  This calculates a right
 	 * justified offset into the memory on an ALIGN_PTR boundary.
 	 */
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
 		u_int used;
 
 		/* Size of the slab struct and free list */
 		if ((keg->uk_flags & UMA_ZONE_REFCNT) == 0)
 			used = sizeof(struct uma_slab) +
 			    keg->uk_ipers * UMA_FRITM_SZ;
 		else
 			used = sizeof(struct uma_slab_refcnt) +
 			    keg->uk_ipers * UMA_FRITMREF_SZ;
 
 		/* Round up to pointer alignment. */
 		if ((used & UMA_ALIGN_PTR) != 0)
 			used = (used & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
 		keg->uk_pgoff = (UMA_SLAB_SIZE * keg->uk_ppera) - used;
 
 		/* Recompute where the header and free list end. */
 		if ((keg->uk_flags & UMA_ZONE_REFCNT) == 0)
 			used = keg->uk_pgoff + sizeof(struct uma_slab)
 			    + keg->uk_ipers * UMA_FRITM_SZ;
 		else
 			used = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
 			    + keg->uk_ipers * UMA_FRITMREF_SZ;
 
 		/*
 		 * The only way the following is possible is if with our
 		 * UMA_ALIGN_PTR adjustments we are now bigger than
 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
 		 * mathematically possible for all cases, so we make
 		 * sure here anyway.
 		 */
 		if (used > UMA_SLAB_SIZE * keg->uk_ppera) {
 			printf("zone %s ipers %d rsize %d size %d\n",
 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
 			    keg->uk_size);
 			panic("UMA slab won't fit.");
 		}
 	}
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		hash_alloc(&keg->uk_hash);
 
 #ifdef UMA_DEBUG
 	printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
 	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
 #endif
 
 	/* The creating zone is the first zone on this keg. */
 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
 
 	/* Publish the keg on the global list. */
 	mtx_lock(&uma_mtx);
 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
 	mtx_unlock(&uma_mtx);
 
 	return (0);
 }
 
 /*
  * Zone header ctor.  This initializes all fields, locks, etc.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_zctor_args
  */
 static int
 zone_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_zctor_args *arg = udata;
 	uma_zone_t zone = mem;
 	uma_zone_t z;
 	uma_keg_t keg;
 
 	bzero(zone, size);
 	zone->uz_name = arg->name;
 	zone->uz_ctor = arg->ctor;
 	zone->uz_dtor = arg->dtor;
 	zone->uz_slab = zone_fetch_slab;
 	zone->uz_init = NULL;
 	zone->uz_fini = NULL;
 	zone->uz_allocs = 0;
 	zone->uz_frees = 0;
 	zone->uz_fails = 0;
 	zone->uz_sleeps = 0;
 	zone->uz_fills = zone->uz_count = 0;
 	zone->uz_flags = 0;
 	zone->uz_warning = NULL;
 	timevalclear(&zone->uz_ratecheck);
 	keg = arg->keg;
 
 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
 		zone->uz_init = arg->uminit;
 		zone->uz_fini = arg->fini;
 		zone->uz_lock = &keg->uk_lock;
 		zone->uz_flags |= UMA_ZONE_SECONDARY;
 		mtx_lock(&uma_mtx);
 		ZONE_LOCK(zone);
 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 			if (LIST_NEXT(z, uz_link) == NULL) {
 				LIST_INSERT_AFTER(z, zone, uz_link);
 				break;
 			}
 		}
 		ZONE_UNLOCK(zone);
 		mtx_unlock(&uma_mtx);
 	} else if (keg == NULL) {
 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
 		    arg->align, arg->flags)) == NULL)
 			return (ENOMEM);
 	} else {
 		struct uma_kctor_args karg;
 		int error;
 
 		/* We should only be here from uma_startup() */
 		karg.size = arg->size;
 		karg.uminit = arg->uminit;
 		karg.fini = arg->fini;
 		karg.align = arg->align;
 		karg.flags = arg->flags;
 		karg.zone = zone;
 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
 		    flags);
 		if (error)
 			return (error);
 	}
 	/*
 	 * Link in the first keg.
 	 */
 	zone->uz_klink.kl_keg = keg;
 	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
 	zone->uz_lock = &keg->uk_lock;
 	zone->uz_size = keg->uk_size;
 	zone->uz_flags |= (keg->uk_flags &
 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
 
 	/*
 	 * Some internal zones don't have room allocated for the per cpu
 	 * caches.  If we're internal, bail out here.
 	 */
 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
 		return (0);
 	}
 
 	if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
 		zone->uz_count = BUCKET_MAX;
 	else if (keg->uk_ipers <= BUCKET_MAX)
 		zone->uz_count = keg->uk_ipers;
 	else
 		zone->uz_count = BUCKET_MAX;
 	return (0);
 }
 
 /*
  * Keg header dtor.  Reports any items still accounted as free in the keg,
  * releases the hash table and destroys the keg lock.  Unlinking the keg
  * from the global keg list is done by zone_dtor() before the keg is freed.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 keg_dtor(void *arg, int size, void *udata)
 {
 	uma_keg_t keg = arg;
 
 	/* Read the counters consistently under the keg lock. */
 	KEG_LOCK(keg);
 	if (keg->uk_free != 0)
 		printf("Freed UMA keg was not empty (%d items). "
 		    " Lost %d pages of memory.\n",
 		    keg->uk_free, keg->uk_pages);
 	KEG_UNLOCK(keg);
 
 	hash_free(&keg->uk_hash);
 
 	KEG_LOCK_FINI(keg);
 }
 
 /*
  * Zone header dtor.  Drains the per-CPU caches, removes the zone from its
  * keg's zone list, drains the zone, detaches every keg link and, for
  * non-secondary zones, destroys the first keg as well.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 zone_dtor(void *arg, int size, void *udata)
 {
 	uma_zone_t zone;
 	uma_keg_t keg;
 	uma_klink_t klink;
 
 	zone = arg;
 	/* Remember the first keg now; the links are torn down below. */
 	keg = zone_first_keg(zone);
 
 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0)
 		cache_drain(zone);
 
 	mtx_lock(&uma_mtx);
 	LIST_REMOVE(zone, uz_link);
 	mtx_unlock(&uma_mtx);
 
 	/*
 	 * XXX there are some races here where
 	 * the zone can be drained but zone lock
 	 * released and then refilled before we
 	 * remove it... we dont care for now
 	 */
 	zone_drain_wait(zone, M_WAITOK);
 
 	/*
 	 * Unlink all of our kegs.  The first link is embedded in the zone
 	 * itself; any others were malloc'ed and must be freed.
 	 */
 	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
 		klink->kl_keg = NULL;
 		LIST_REMOVE(klink, kl_link);
 		if (klink != &zone->uz_klink)
 			free(klink, M_TEMP);
 	}
 
 	/*
 	 * We only destroy kegs from non secondary zones.
 	 */
 	if (zone->uz_flags & UMA_ZONE_SECONDARY)
 		return;
 
 	mtx_lock(&uma_mtx);
 	LIST_REMOVE(keg, uk_link);
 	mtx_unlock(&uma_mtx);
 	zone_free_item(kegs, keg, NULL, SKIP_NONE,
 	    ZFREE_STATFREE);
 }
 
 /*
  * Invokes a callback on every zone of every keg on the global keg list.
  * The global UMA mutex is held for the whole walk, including while the
  * callback runs.
  *
  * Arguments:
  *	zfunc  A pointer to a function which accepts a zone
  *		as an argument.
  *
  * Returns:
  *	Nothing
  */
 static void
 zone_foreach(void (*zfunc)(uma_zone_t))
 {
 	uma_keg_t k;
 	uma_zone_t z;
 
 	mtx_lock(&uma_mtx);
 	LIST_FOREACH(k, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &k->uk_zones, uz_link) {
 			zfunc(z);
 		}
 	}
 	mtx_unlock(&uma_mtx);
 }
 
 /* Public functions */
 /*
  * First stage of UMA bootstrap.  See uma.h
  *
  * In order, this: initializes the global UMA mutex, computes the maximum
  * items-per-slab values used to size off-page slab headers, constructs the
  * "UMA Kegs" zone in place on the static master keg, threads the supplied
  * boot pages onto the boot page list, constructs the "UMA Zones" zone in
  * place, creates the slab-header and hash zones, initializes the bucket
  * zones and finally marks the allocator as being at the UMA_STARTUP stage.
  *
  * Arguments:
  *	bootmem     Start of the memory backing the boot pages
  *	boot_pages  Number of UMA_SLAB_SIZE-sized pages at bootmem
  */
 void
 uma_startup(void *bootmem, int boot_pages)
 {
 	struct uma_zctor_args args;
 	uma_slab_t slab;
 	u_int slabsize;
 	u_int objsize, totsize, wsize;
 	int i;
 
 #ifdef UMA_DEBUG
 	printf("Creating uma keg headers zone and keg.\n");
 #endif
 	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
 
 	/*
 	 * Figure out the maximum number of items-per-slab we'll have if
 	 * we're using the OFFPAGE slab header to track free items, given
 	 * all possible object sizes and the maximum desired wastage
 	 * (UMA_MAX_WASTE).
 	 *
 	 * We iterate until we find an object size for
 	 * which the calculated wastage in keg_small_init() will be
 	 * enough to warrant OFFPAGE.  Since wastedspace versus objsize
 	 * is an overall increasing see-saw function, we find the smallest
 	 * objsize such that the wastage is always acceptable for objects
 	 * with that objsize or smaller.  Since a smaller objsize always
 	 * generates a larger possible uma_max_ipers, we use this computed
 	 * objsize to calculate the largest ipers possible.  Since the
 	 * ipers calculated for OFFPAGE slab headers is always larger than
 	 * the ipers initially calculated in keg_small_init(), we use
 	 * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
 	 * obtain the maximum ipers possible for offpage slab headers.
 	 *
 	 * It should be noted that ipers versus objsize is an inversely
 	 * proportional function which drops off rather quickly so as
 	 * long as our UMA_MAX_WASTE is such that the objsize we calculate
 	 * falls into the portion of the inverse relation AFTER the steep
 	 * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
 	 *
 	 * Note that we have 8-bits (1 byte) to use as a freelist index
 	 * inside the actual slab header itself and this is enough to
 	 * accommodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
 	 * object with offpage slab header would have ipers =
 	 * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
 	 * 1 greater than what our byte-integer freelist index can
 	 * accommodate, but we know that this situation never occurs as
 	 * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
 	 * that we need to go to offpage slab headers.  Or, if we do,
 	 * then we trap that condition below and panic in the INVARIANTS case.
 	 */
 	/* Pass 1: plain slabs (struct uma_slab / UMA_FRITM_SZ). */
 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE;
 	totsize = wsize;
 	objsize = UMA_SMALLEST_UNIT;
 	while (totsize >= wsize) {
 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
 		    (objsize + UMA_FRITM_SZ);
 		totsize *= (UMA_FRITM_SZ + objsize);
 		objsize++;
 	}
 	/* The loop steps one past the size that ended it; step back. */
 	if (objsize > UMA_SMALLEST_UNIT)
 		objsize--;
 	uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
 
 	/* Pass 2: same search for reference-counted slabs. */
 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE;
 	totsize = wsize;
 	objsize = UMA_SMALLEST_UNIT;
 	while (totsize >= wsize) {
 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
 		    (objsize + UMA_FRITMREF_SZ);
 		totsize *= (UMA_FRITMREF_SZ + objsize);
 		objsize++;
 	}
 	if (objsize > UMA_SMALLEST_UNIT)
 		objsize--;
 	uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
 
 	KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
 	    ("uma_startup: calculated uma_max_ipers values too large!"));
 
 #ifdef UMA_DEBUG
 	printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
 	printf("Calculated uma_max_ipers_ref (for OFFPAGE) is %d\n",
 	    uma_max_ipers_ref);
 #endif
 
 	/* "manually" create the initial zone */
 	args.name = "UMA Kegs";
 	args.size = sizeof(struct uma_keg);
 	args.ctor = keg_ctor;
 	args.dtor = keg_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	/* A non-NULL keg makes zone_ctor() construct the keg in place. */
 	args.keg = &masterkeg;
 	args.align = 32 - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	/* The initial zone has no Per cpu queues so it's smaller */
 	zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
 
 #ifdef UMA_DEBUG
 	printf("Filling boot free list.\n");
 #endif
 	/*
 	 * Carve bootmem into UMA_SLAB_SIZE pieces; a slab header is written
 	 * at the start of each piece and the piece is put on the boot list.
 	 */
 	for (i = 0; i < boot_pages; i++) {
 		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
 		slab->us_data = (u_int8_t *)slab;
 		slab->us_flags = UMA_SLAB_BOOT;
 		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
 	}
 	mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
 
 #ifdef UMA_DEBUG
 	printf("Creating uma zone headers zone and keg.\n");
 #endif
 	args.name = "UMA Zones";
 	/* Zone items carry one struct uma_cache per possible CPU id. */
 	args.size = sizeof(struct uma_zone) +
 	    (sizeof(struct uma_cache) * (mp_maxid + 1));
 	args.ctor = zone_ctor;
 	args.dtor = zone_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	/* A NULL keg makes zone_ctor() allocate one via uma_kcreate(). */
 	args.keg = NULL;
 	args.align = 32 - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	/* The initial zone has no Per cpu queues so it's smaller */
 	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
 
 #ifdef UMA_DEBUG
 	printf("Initializing pcpu cache locks.\n");
 #endif
 #ifdef UMA_DEBUG
 	printf("Creating slab and hash zones.\n");
 #endif
 
 	/*
 	 * This is the max number of free list items we'll have with
 	 * offpage slabs.
 	 */
 	slabsize = uma_max_ipers * UMA_FRITM_SZ;
 	slabsize += sizeof(struct uma_slab);
 
 	/* Now make a zone for slab headers */
 	slabzone = uma_zcreate("UMA Slabs",
 				slabsize,
 				NULL, NULL, NULL, NULL,
 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	/*
 	 * We also create a zone for the bigger slabs with reference
 	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
 	 */
 	slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
 	slabsize += sizeof(struct uma_slab_refcnt);
 	slabrefzone = uma_zcreate("UMA RCntSlabs",
 				  slabsize,
 				  NULL, NULL, NULL, NULL,
 				  UMA_ALIGN_PTR,
 				  UMA_ZFLAG_INTERNAL);
 
 	/* Zone for the initial-size slab hash tables. */
 	hashzone = uma_zcreate("UMA Hash",
 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	bucket_init();
 
 	/* keg_ctor() consults 'booted' when choosing a page allocator. */
 	booted = UMA_STARTUP;
 
 #ifdef UMA_DEBUG
 	printf("UMA startup complete.\n");
 #endif
 }
 
 /*
  * Second stage of UMA bootstrap: advance the boot stage to UMA_STARTUP2
  * and then enable the bucket zones.  See uma.h
  */
 void
 uma_startup2(void)
 {
 
 	booted = UMA_STARTUP2;
 	bucket_enable();
 
 #ifdef UMA_DEBUG
 	printf("UMA startup2 complete.\n");
 #endif
 }
 
 /*
  * Third stage of UMA bootstrap: initialize the UMA callout handle and arm
  * it to call uma_timeout() after UMA_TIMEOUT * hz ticks.
  */
 static void
 uma_startup3(void)
 {
 
 #ifdef UMA_DEBUG
 	printf("Starting callout.\n");
 #endif
 
 	callout_init(&uma_callout, CALLOUT_MPSAFE);
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 
 #ifdef UMA_DEBUG
 	printf("UMA startup3 complete.\n");
 #endif
 }
 
 /*
  * Allocate and construct a keg for the given zone by packaging the
  * arguments for keg_ctor() and allocating an item from the kegs zone.
  *
  * Returns:
  *	The new keg, or whatever zone_alloc_item() returns on failure.
  */
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 		int align, u_int32_t flags)
 {
 	struct uma_kctor_args kargs;
 
 	/* Resolve the cache-alignment placeholder to the current value. */
 	if (align == UMA_ALIGN_CACHE)
 		align = uma_align_cache;
 
 	kargs.zone = zone;
 	kargs.size = size;
 	kargs.align = align;
 	kargs.flags = flags;
 	kargs.uminit = uminit;
 	kargs.fini = fini;
 
 	return (zone_alloc_item(kegs, &kargs, M_WAITOK));
 }
 
 /*
  * Set the alignment that UMA_ALIGN_CACHE resolves to.  Passing
  * UMA_ALIGN_CACHE itself leaves the current value alone.  See uma.h
  */
 void
 uma_set_align(int align)
 {
 
 	if (align == UMA_ALIGN_CACHE)
 		return;
 
 	uma_align_cache = align;
 }
 
 /*
  * Create a zone by packaging the arguments for zone_ctor() and allocating
  * an item from the zones zone.  See uma.h
  */
 uma_zone_t
 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 		uma_init uminit, uma_fini fini, int align, u_int32_t flags)
 {
 	struct uma_zctor_args zargs;
 
 	/* No keg is supplied; a non-secondary zone gets one in zone_ctor(). */
 	zargs.keg = NULL;
 
 	/* This stuff is essential for the zone ctor */
 	zargs.name = name;
 	zargs.size = size;
 	zargs.align = align;
 	zargs.flags = flags;
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = uminit;
 	zargs.fini = fini;
 
 	return (zone_alloc_item(zones, &zargs, M_WAITOK));
 }
 
 /*
  * Create a secondary zone layered on the first keg of 'master'.  Size,
  * alignment and flags are taken from that keg; ctor/dtor/init/fini come
  * from the caller.  See uma.h
  */
 uma_zone_t
 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
 {
 	struct uma_zctor_args zargs;
 	uma_keg_t mkeg;
 
 	mkeg = zone_first_keg(master);
 
 	/* Properties inherited from the master's keg. */
 	zargs.keg = mkeg;
 	zargs.size = mkeg->uk_size;
 	zargs.align = mkeg->uk_align;
 	zargs.flags = mkeg->uk_flags | UMA_ZONE_SECONDARY;
 
 	/* Properties specific to the new zone. */
 	zargs.name = name;
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = zinit;
 	zargs.fini = zfini;
 
 	/* XXX Attaches only one keg of potentially many. */
 	return (zone_alloc_item(zones, &zargs, M_WAITOK));
 }
 
 /*
  * Lock two zones.  The zone at the lower address is always locked first;
  * the second lock is taken with MTX_DUPOK.
  */
 static void
 zone_lock_pair(uma_zone_t a, uma_zone_t b)
 {
 	uma_zone_t first, second;
 
 	if (a < b) {
 		first = a;
 		second = b;
 	} else {
 		first = b;
 		second = a;
 	}
 
 	ZONE_LOCK(first);
 	mtx_lock_flags(second->uz_lock, MTX_DUPOK);
 }
 
 /*
  * Unlock two zones, 'a' first and then 'b'.
  */
 static void
 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
 {
 
 	ZONE_UNLOCK(a);
 
 	ZONE_UNLOCK(b);
 }
 
 /*
  * Attach the first keg of 'master' to 'zone' as an additional keg, turning
  * 'zone' into a multi-keg zone.
  *
  * Arguments:
  *	zone    An existing secondary zone that resolves items via vtoslab()
  *	master  The zone whose first keg should be added to 'zone'
  *
  * Returns:
  *	0       Success
  *	EINVAL  'zone' is not a vtoslab secondary zone, 'master' does not
  *		use vtoslab(), or the two disagree on UMA_ZONE_REFCNT
  *	E2BIG   The two zones have different item sizes
  */
 int
 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
 {
 	uma_klink_t klink;
 	uma_klink_t kl;
 	int error;
 
 	error = 0;
 	/* Allocate before taking the locks; M_WAITOK may sleep. */
 	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
 
 	zone_lock_pair(zone, master);
 	/*
 	 * zone must use vtoslab() to resolve objects and must already be
 	 * a secondary.
 	 */
 	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
 	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The new master must also use vtoslab().  This has to look at
 	 * master's flags; zone's own flags were already checked above.
 	 */
 	if ((master->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * Both must either be refcnt, or not be refcnt.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
 	    (master->uz_flags & UMA_ZONE_REFCNT)) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The underlying object must be the same size.  rsize
 	 * may be different.
 	 */
 	if (master->uz_size != zone->uz_size) {
 		error = E2BIG;
 		goto out;
 	}
 	/*
 	 * Put it at the end of the list.
 	 */
 	klink->kl_keg = zone_first_keg(master);
 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
 		if (LIST_NEXT(kl, kl_link) == NULL) {
 			LIST_INSERT_AFTER(kl, klink, kl_link);
 			break;
 		}
 	}
 	/* Ownership of the link has passed to the zone's keg list. */
 	klink = NULL;
 	zone->uz_flags |= UMA_ZFLAG_MULTI;
 	zone->uz_slab = zone_fetch_slab_multi;
 
 out:
 	zone_unlock_pair(zone, master);
 	/* Only still non-NULL on the error paths. */
 	if (klink != NULL)
 		free(klink, M_TEMP);
 
 	return (error);
 }
 
 
 /*
  * Destroy a zone by returning it to the zones zone; no destructor is
  * skipped (SKIP_NONE).  See uma.h
  */
 void
 uma_zdestroy(uma_zone_t zone)
 {
 
 	zone_free_item(zones, zone, NULL,
 	    SKIP_NONE, ZFREE_STATFREE);
 }
 
 /*
  * Allocate one item from a zone.  See uma.h
  *
  * The fast path takes an item from the current CPU's alloc bucket inside a
  * critical section.  When the per-CPU buckets are empty the zone lock is
  * taken to fetch a full bucket from the zone, or to fill a new one; if no
  * bucket can be obtained a single item is allocated directly with
  * zone_alloc_item().
  *
  * Arguments:
  *	zone   The zone to allocate from
  *	udata  Passed through to the zone's ctor
  *	flags  M_* allocation flags
  *
  * Returns:
  *	The item, or NULL if the allocation or the ctor failed.
  */
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
 	void *item;
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	int cpu;
 
 	/* This is the fast path allocation */
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
 #endif
 	CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
 	    zone->uz_name, flags);
 
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
 #ifdef DEBUG_MEMGUARD
 	if (memguard_cmp_zone(zone)) {
 		item = memguard_alloc(zone->uz_size, flags);
 		if (item != NULL) {
 			/*
 			 * Avoid conflict with the use-after-free
 			 * protecting infrastructure from INVARIANTS.
 			 */
 			/*
 			 * NOTE(review): the failure returns below do not
 			 * hand 'item' back to memguard -- confirm whether
 			 * that is intended.
 			 */
 			if (zone->uz_init != NULL &&
 			    zone->uz_init != mtrash_init &&
 			    zone->uz_init(item, zone->uz_size, flags) != 0)
 				return (NULL);
 			if (zone->uz_ctor != NULL &&
 			    zone->uz_ctor != mtrash_ctor &&
 			    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
 				/*
 				 * uz_fini is optional: zone_ctor() leaves
 				 * it NULL for every non-secondary zone, so
 				 * it must be checked before being called.
 				 */
 				if (zone->uz_fini != NULL)
 					zone->uz_fini(item, zone->uz_size);
 				return (NULL);
 			}
 			return (item);
 		}
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 	/*
 	 * If possible, allocate from the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to allocate from
 	 * the current cache; when we re-acquire the critical section, we
 	 * must detect and handle migration if it has occurred.
 	 */
 zalloc_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 zalloc_start:
 	bucket = cache->uc_allocbucket;
 
 	if (bucket) {
 		if (bucket->ub_cnt > 0) {
 			bucket->ub_cnt--;
 			item = bucket->ub_bucket[bucket->ub_cnt];
 #ifdef INVARIANTS
 			bucket->ub_bucket[bucket->ub_cnt] = NULL;
 #endif
 			KASSERT(item != NULL,
 			    ("uma_zalloc: Bucket pointer mangled."));
 			cache->uc_allocs++;
 			critical_exit();
 #ifdef INVARIANTS
 			ZONE_LOCK(zone);
 			uma_dbg_alloc(zone, NULL, item);
 			ZONE_UNLOCK(zone);
 #endif
 			if (zone->uz_ctor != NULL) {
 				if (zone->uz_ctor(item, zone->uz_size,
 				    udata, flags) != 0) {
 					/* The ctor failed; skip the dtor. */
 					zone_free_item(zone, item, udata,
 					    SKIP_DTOR, ZFREE_STATFAIL |
 					    ZFREE_STATFREE);
 					return (NULL);
 				}
 			}
 			if (flags & M_ZERO)
 				bzero(item, zone->uz_size);
 			return (item);
 		} else if (cache->uc_freebucket) {
 			/*
 			 * We have run out of items in our allocbucket.
 			 * See if we can switch with our free bucket.
 			 */
 			if (cache->uc_freebucket->ub_cnt > 0) {
 #ifdef UMA_DEBUG_ALLOC
 				printf("uma_zalloc: Swapping empty with"
 				    " alloc.\n");
 #endif
 				bucket = cache->uc_freebucket;
 				cache->uc_freebucket = cache->uc_allocbucket;
 				cache->uc_allocbucket = bucket;
 
 				goto zalloc_start;
 			}
 		}
 	}
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zone lock, so we
 	 * must drop the critical section, then re-acquire it when we go back
 	 * to the cache.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	critical_exit();
 	ZONE_LOCK(zone);
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 	bucket = cache->uc_allocbucket;
 	/*
 	 * The cache we are looking at now may differ from the one examined
 	 * above; if it has items after all, go back to the fast path.
 	 */
 	if (bucket != NULL) {
 		if (bucket->ub_cnt > 0) {
 			ZONE_UNLOCK(zone);
 			goto zalloc_start;
 		}
 		bucket = cache->uc_freebucket;
 		if (bucket != NULL && bucket->ub_cnt > 0) {
 			ZONE_UNLOCK(zone);
 			goto zalloc_start;
 		}
 	}
 
 	/* Since we have locked the zone we may as well send back our stats */
 	zone->uz_allocs += cache->uc_allocs;
 	cache->uc_allocs = 0;
 	zone->uz_frees += cache->uc_frees;
 	cache->uc_frees = 0;
 
 	/* Our old one is now a free bucket */
 	if (cache->uc_allocbucket) {
 		KASSERT(cache->uc_allocbucket->ub_cnt == 0,
 		    ("uma_zalloc_arg: Freeing a non free bucket."));
 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
 		    cache->uc_allocbucket, ub_link);
 		cache->uc_allocbucket = NULL;
 	}
 
 	/* Check the free list for a new alloc bucket */
 	if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 
 		LIST_REMOVE(bucket, ub_link);
 		cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/* Bump up our uz_count so we get here less */
 	if (zone->uz_count < BUCKET_MAX)
 		zone->uz_count++;
 
 	/*
 	 * Now lets just fill a bucket and put it on the free list.  If that
 	 * works we'll restart the allocation from the beginning.
 	 */
 	if (zone_alloc_bucket(zone, flags)) {
 		ZONE_UNLOCK(zone);
 		goto zalloc_restart;
 	}
 	ZONE_UNLOCK(zone);
 	/*
 	 * We may not be able to get a bucket so return an actual item.
 	 */
 #ifdef UMA_DEBUG
 	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
 #endif
 
 	item = zone_alloc_item(zone, udata, flags);
 	return (item);
 }
 
 /*
  * Find or allocate a slab in 'keg' that has at least one free item.  Must
  * be called with the keg lock held.  May sleep on the keg when the keg is
  * at its page limit and M_NOWAIT is not set.
  *
  * Arguments:
  *	keg    The keg to take the slab from
  *	zone   The zone on whose behalf we allocate (for stats and flags)
  *	flags  M_* allocation flags
  *
  * Returns:
  *	A slab on the keg's partially-used list, or NULL.
  */
 static uma_slab_t
 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
 {
 	uma_slab_t slab;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 
 	for (;;) {
 		/*
 		 * Find a slab with some space.  Prefer slabs that are partially
 		 * used over those that are totally full.  This helps to reduce
 		 * fragmentation.
 		 */
 		if (keg->uk_free != 0) {
 			if (LIST_EMPTY(&keg->uk_part_slab)) {
 				/* Promote an untouched slab to "partial". */
 				slab = LIST_FIRST(&keg->uk_free_slab);
 				LIST_REMOVE(slab, us_link);
 				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
 				    us_link);
 			} else {
 				slab = LIST_FIRST(&keg->uk_part_slab);
 			}
 			MPASS(slab->us_keg == keg);
 			return (slab);
 		}
 
 		/*
 		 * M_NOVM means don't ask at all!
 		 */
 		if (flags & M_NOVM)
 			break;
 
 		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
 			keg->uk_flags |= UMA_ZFLAG_FULL;
 			/*
 			 * If this is not a multi-zone, set the FULL bit.
 			 * Otherwise slab_multi() takes care of it.
 			 */
 			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
 				zone->uz_flags |= UMA_ZFLAG_FULL;
 				zone_log_warning(zone);
 			}
 			if (flags & M_NOWAIT)
 				break;
 			/* Sleep on the keg, then start over. */
 			zone->uz_sleeps++;
 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
 			continue;
 		}
 
 		keg->uk_recurse++;
 		slab = keg_alloc_slab(keg, zone, flags);
 		keg->uk_recurse--;
 
 		/*
 		 * If we got a slab here it's safe to mark it partially used
 		 * and return.  We assume that the caller is going to remove
 		 * at least one item.
 		 */
 		if (slab != NULL) {
 			MPASS(slab->us_keg == keg);
 			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
 			return (slab);
 		}
 
 		/*
 		 * We might not have been able to get a slab but another cpu
 		 * could have while we were unlocked.  Check again before we
 		 * fail.
 		 */
 		flags |= M_NOVM;
 	}
 
 	/* Every exit from the loop leaves us without a slab. */
 	return (NULL);
 }
 
 /*
  * Switch from holding the keg lock to holding the zone lock.  Does nothing
  * when the zone uses the keg's lock as its own.
  */
 static inline void
 zone_relock(uma_zone_t zone, uma_keg_t keg)
 {
 
 	if (zone->uz_lock == &keg->uk_lock)
 		return;
 
 	KEG_UNLOCK(keg);
 	ZONE_LOCK(zone);
 }
 
 /*
  * Switch from holding the zone lock to holding the keg lock.  Does nothing
  * when the zone uses the keg's lock as its own.
  */
 static inline void
 keg_relock(uma_keg_t keg, uma_zone_t zone)
 {
 
 	if (zone->uz_lock == &keg->uk_lock)
 		return;
 
 	ZONE_UNLOCK(zone);
 	KEG_LOCK(keg);
 }
 
 /*
  * Fetch a slab for a single-keg zone.  Retries keg_fetch_slab() until it
  * yields a slab, giving up after one failed attempt when M_NOWAIT or
  * M_NOVM is set.
  *
  * Arguments:
  *	zone   The zone being allocated from
  *	keg    The keg to use, or NULL for the zone's first keg
  *	flags  M_* allocation flags
  *
  * Returns:
  *	A slab with free items, or NULL.
  */
 static uma_slab_t
 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
 {
 	uma_slab_t slab;
 
 	if (keg == NULL)
 		keg = zone_first_keg(zone);
 
 	/*
 	 * This is to prevent us from recursively trying to allocate
 	 * buckets.  The problem is that if an allocation forces us to
 	 * grab a new bucket we will call page_alloc, which will go off
 	 * and cause the vm to allocate vm_map_entries.  If we need new
 	 * buckets there too we will recurse in kmem_alloc and bad
 	 * things happen.  So instead we return a NULL bucket, and make
 	 * the code that allocates buckets smart enough to deal with it
 	 */
 	if ((keg->uk_flags & UMA_ZFLAG_BUCKET) != 0 && keg->uk_recurse != 0)
 		return (NULL);
 
 	do {
 		slab = keg_fetch_slab(keg, zone, flags);
 		if (slab != NULL)
 			return (slab);
 	} while ((flags & (M_NOWAIT | M_NOVM)) == 0);
 
 	return (NULL);
 }
 
 /*
  * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
  * with the keg locked.  Caller must call zone_relock() afterwards if the
  * zone lock is required.  On NULL the zone lock is held.
  *
  * The last pointer is used to seed the search.  It is not required.
  *
  * Arguments:
  *	zone    The multi-keg zone being allocated from
  *	last    Keg to try first, or NULL
  *	rflags  The caller's M_* allocation flags
  *
  * Returns:
  *	A slab with free items, or NULL.
  */
 static uma_slab_t
 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
 {
 	uma_klink_t klink;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int flags;	/* flags used for the current pass */
 	int empty;	/* kegs seen this pass that are not FULL */
 	int full;	/* kegs seen this pass that are FULL */
 
 	/*
 	 * Don't wait on the first pass.  This will skip limit tests
 	 * as well.  We don't want to block if we can find a provider
 	 * without blocking.
 	 */
 	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
 	/*
 	 * Use the last slab allocated as a hint for where to start
 	 * the search.
 	 */
 	if (last) {
 		slab = keg_fetch_slab(last, zone, flags);
 		if (slab)
 			return (slab);
 		/* The hint failed; go back to the zone lock for the scan. */
 		zone_relock(zone, last);
 		last = NULL;
 	}
 	/*
 	 * Loop until we have a slab in case of transient failures
 	 * while M_WAITOK is specified.  I'm not sure this is 100%
 	 * required but we've done it for so long now.
 	 */
 	for (;;) {
 		empty = 0;
 		full = 0;
 		/*
 		 * Search the available kegs for slabs.  Be careful to hold the
 		 * correct lock while calling into the keg layer.
 		 */
 		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
 			keg = klink->kl_keg;
 			keg_relock(keg, zone);
 			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
 				slab = keg_fetch_slab(keg, zone, flags);
 				/* Success returns with this keg's lock held. */
 				if (slab)
 					return (slab);
 			}
 			/* Re-test: keg_fetch_slab() may have set FULL. */
 			if (keg->uk_flags & UMA_ZFLAG_FULL)
 				full++;
 			else
 				empty++;
 			zone_relock(zone, keg);
 		}
 		/* A caller that cannot wait gets a single pass. */
 		if (rflags & (M_NOWAIT | M_NOVM))
 			break;
 		/* Later passes use the caller's real flags. */
 		flags = rflags;
 		/*
 		 * All kegs are full.  XXX We can't atomically check all kegs
 		 * and sleep so just sleep for a short period and retry.
 		 */
 		if (full && !empty) {
 			zone->uz_flags |= UMA_ZFLAG_FULL;
 			zone->uz_sleeps++;
 			zone_log_warning(zone);
 			msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
 			continue;
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Hand out the first free item of a slab.
  *
  * The lock of the slab's keg must be owned (asserted below).  The item at
  * the head of the slab's free list is unlinked, the slab and keg free
  * counters are decremented, and a slab that has just run out of free items
  * is moved to the keg's full-slab list.
  *
  * Returns a pointer to the item inside the slab's data area.
  */
 static void *
 slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
 {
 	uma_keg_t owner;
 	void *obj;
 	u_int8_t idx;
 
 	owner = slab->us_keg;
 	mtx_assert(&owner->uk_lock, MA_OWNED);
 
 	/*
 	 * Pop the head of the free list.  Refcnt slabs keep their free
 	 * list in the uma_slabrefcnt layout, so index through that view.
 	 */
 	idx = slab->us_firstfree;
 	if ((owner->uk_flags & UMA_ZONE_REFCNT) == 0)
 		slab->us_firstfree = slab->us_freelist[idx].us_item;
 	else
 		slab->us_firstfree =
 		    ((uma_slabrefcnt_t)slab)->us_freelist[idx].us_item;
 	obj = slab->us_data + (owner->uk_rsize * idx);
 
 	slab->us_freecount--;
 	owner->uk_free--;
 #ifdef INVARIANTS
 	uma_dbg_alloc(zone, slab, obj);
 #endif
 	/* A slab with nothing left to give belongs on the full list. */
 	if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&owner->uk_full_slab, slab, us_link);
 	}
 
 	return (obj);
 }
 
 /*
  * Obtain a bucket, fill it with items taken from the zone's slabs and put
  * it on the zone's full-bucket list.
  *
  * The zone lock is dropped and re-taken around bucket_alloc() and around
  * the zone init calls, so the caller is expected to enter with the zone
  * lock held.
  *
  * Arguments:
  *	zone   The zone to fill a bucket for.
  *	flags  M_* allocation flags; M_ZERO is masked off for the bucket
  *	       allocation itself.
  *
  * Returns:
  *	1 if a non-empty bucket was inserted on zone->uz_full_bucket,
  *	0 otherwise (the bucket, if one was obtained, is released with
  *	bucket_free()).
  */
 static int
 zone_alloc_bucket(uma_zone_t zone, int flags)
 {
 	uma_bucket_t bucket;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int16_t saved;
 	int max, origflags = flags;
 
 	/*
 	 * Try this zone's free list first so we don't allocate extra buckets.
 	 */
 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 		KASSERT(bucket->ub_cnt == 0,
 		    ("zone_alloc_bucket: Bucket on free list is not empty."));
 		LIST_REMOVE(bucket, ub_link);
 	} else {
 		int bflags;
 
 		bflags = (flags & ~M_ZERO);
 		if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
 			bflags |= M_NOVM;
 
 		ZONE_UNLOCK(zone);
 		bucket = bucket_alloc(zone->uz_count, bflags);
 		ZONE_LOCK(zone);
 	}
 
 	if (bucket == NULL) {
 		return (0);
 	}
 
 #ifdef SMP
 	/*
 	 * This code is here to limit the number of simultaneous bucket fills
 	 * for any given zone to the number of per cpu caches in this zone. This
 	 * is done so that we don't allocate more memory than we really need.
 	 */
 	if (zone->uz_fills >= mp_ncpus)
 		goto done;
 
 #endif
 	zone->uz_fills++;
 
 	max = MIN(bucket->ub_entries, zone->uz_count);
 	/* Try to keep the buckets totally full */
 	saved = bucket->ub_cnt;
 	slab = NULL;
 	keg = NULL;
 	/*
 	 * The keg of the previous slab is passed back to uz_slab() on each
 	 * iteration (NULL the first time).
 	 */
 	while (bucket->ub_cnt < max &&
 	    (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
 		keg = slab->us_keg;
 		while (slab->us_freecount && bucket->ub_cnt < max) {
 			bucket->ub_bucket[bucket->ub_cnt++] =
 			    slab_alloc_item(zone, slab);
 		}
 
 		/* Don't block on the next fill */
 		flags |= M_NOWAIT;
 	}
 	/*
 	 * A non-NULL slab means the last uz_slab() call succeeded, so switch
 	 * from that slab's keg back to the zone lock.
 	 */
 	if (slab)
 		zone_relock(zone, keg);
 
 	/*
 	 * We unlock here because we need to call the zone's init.
 	 * It should be safe to unlock because the slab dealt with
 	 * above is already on the appropriate list within the keg
 	 * and the bucket we filled is not yet on any list, so we
 	 * own it.
 	 */
 	if (zone->uz_init != NULL) {
 		int i;
 
 		ZONE_UNLOCK(zone);
 		/* Only the items added by this call (from 'saved' on) are init'd. */
 		for (i = saved; i < bucket->ub_cnt; i++)
 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
 			    origflags) != 0)
 				break;
 		/*
 		 * If we couldn't initialize the whole bucket, put the
 		 * rest back onto the freelist.
 		 */
 		if (i != bucket->ub_cnt) {
 			int j;
 
 			for (j = i; j < bucket->ub_cnt; j++) {
 				zone_free_item(zone, bucket->ub_bucket[j],
 				    NULL, SKIP_FINI, 0);
 #ifdef INVARIANTS
 				bucket->ub_bucket[j] = NULL;
 #endif
 			}
 			bucket->ub_cnt = i;
 		}
 		ZONE_LOCK(zone);
 	}
 
 	zone->uz_fills--;
 	if (bucket->ub_cnt != 0) {
 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
 		    bucket, ub_link);
 		return (1);
 	}
 #ifdef SMP
 done:
 #endif
 	/*
 	 * Reached with an empty bucket, or (SMP) when the fill limit was hit
 	 * before uz_fills was incremented.
 	 */
 	bucket_free(bucket);
 
 	return (0);
 }
 /*
  * Allocates an item for an internal zone
  *
  * The item is taken straight from a slab (no bucket or per-CPU cache is
  * involved) and is then passed through the zone's init and ctor.
  *
  * Arguments
  *	zone   The zone to alloc for.
  *	udata  The data to be passed to the constructor.
  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
  *
  * Returns
  *	NULL if no slab could be obtained, or if the zone's init or ctor
  *	reported failure (the item is freed again in that case)
  *	An item if successful
  */
 
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
 {
 	uma_slab_t slab;
 	void *item;
 
 	item = NULL;
 
 #ifdef UMA_DEBUG_ALLOC
 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
 #endif
 	ZONE_LOCK(zone);
 
 	slab = zone->uz_slab(zone, NULL, flags);
 	if (slab == NULL) {
 		zone->uz_fails++;
 		ZONE_UNLOCK(zone);
 		return (NULL);
 	}
 
 	item = slab_alloc_item(zone, slab);
 
 	/* Switch from the slab's keg back to the zone to update its stats. */
 	zone_relock(zone, slab->us_keg);
 	zone->uz_allocs++;
 	ZONE_UNLOCK(zone);
 
 	/*
 	 * We have to call both the zone's init (not the keg's init)
 	 * and the zone's ctor.  This is because the item is going from
 	 * a keg slab directly to the user, and the user is expecting it
 	 * to be both zone-init'd as well as zone-ctor'd.
 	 */
 	if (zone->uz_init != NULL) {
 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
 			/* Init failed: free the item with skip level SKIP_FINI. */
 			zone_free_item(zone, item, udata, SKIP_FINI,
 			    ZFREE_STATFAIL | ZFREE_STATFREE);
 			return (NULL);
 		}
 	}
 	if (zone->uz_ctor != NULL) {
 		if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
 			/* Ctor failed: free the item with skip level SKIP_DTOR. */
 			zone_free_item(zone, item, udata, SKIP_DTOR,
 			    ZFREE_STATFAIL | ZFREE_STATFREE);
 			return (NULL);
 		}
 	}
 	/*
 	 * NOTE(review): the zeroing runs after the ctor, so with M_ZERO it
 	 * overwrites whatever the ctor stored in the item -- confirm this
 	 * ordering is intended.
 	 */
 	if (flags & M_ZERO)
 		bzero(item, zone->uz_size);
 
 	return (item);
 }
 
 /*
  * See uma.h
  *
  * Free an item back to a zone.
  *
  * After the zone's dtor has run, the item is stored, in order of
  * preference: in the current CPU's free bucket; in a bucket obtained by
  * swapping the per-CPU buckets or by taking one from the zone's free
  * bucket list; or directly back in its slab through zone_free_item().
  *
  * Arguments:
  *	zone   The zone the item is freed to.
  *	item   The item to free; NULL is accepted and ignored.
  *	udata  Passed to the dtor and forwarded to zone_free_item().
  */
 void
 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
 {
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	int bflags;
 	int cpu;
 
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
 #endif
 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
 	    zone->uz_name);
 
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 #ifdef DEBUG_MEMGUARD
 	/*
 	 * Memguard-owned items never enter the caches: run the dtor and
 	 * fini (except the mtrash ones) and hand the item to memguard.
 	 */
 	if (is_memguard_addr(item)) {
 		if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
 			zone->uz_dtor(item, zone->uz_size, udata);
 		if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
 			zone->uz_fini(item, zone->uz_size);
 		memguard_free(item);
 		return;
 	}
 #endif
 	if (zone->uz_dtor)
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 #ifdef INVARIANTS
 	ZONE_LOCK(zone);
 	if (zone->uz_flags & UMA_ZONE_MALLOC)
 		uma_dbg_free(zone, udata, item);
 	else
 		uma_dbg_free(zone, NULL, item);
 	ZONE_UNLOCK(zone);
 #endif
 	/*
 	 * The race here is acceptable.  If we miss it we'll just have to wait
 	 * a little longer for the limits to be reset.
 	 *
 	 * A zone flagged full skips the caches so that the item goes
 	 * through zone_free_item().
 	 */
 	if (zone->uz_flags & UMA_ZFLAG_FULL)
 		goto zfree_internal;
 
 	/*
 	 * If possible, free to the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to free to the
 	 * current cache; when we re-acquire the critical section, we must
 	 * detect and handle migration if it has occurred.
 	 */
 zfree_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 	/* Every jump to zfree_start is made from inside a critical section. */
 zfree_start:
 	bucket = cache->uc_freebucket;
 
 	if (bucket) {
 		/*
 		 * Do we have room in our bucket? It is OK for this uz count
 		 * check to be slightly out of sync.
 		 */
 
 		if (bucket->ub_cnt < bucket->ub_entries) {
 			KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
 			    ("uma_zfree: Freeing to non free bucket index."));
 			bucket->ub_bucket[bucket->ub_cnt] = item;
 			bucket->ub_cnt++;
 			cache->uc_frees++;
 			critical_exit();
 			return;
 		} else if (cache->uc_allocbucket) {
 #ifdef UMA_DEBUG_ALLOC
 			printf("uma_zfree: Swapping buckets.\n");
 #endif
 			/*
 			 * We have run out of space in our freebucket.
 			 * See if we can switch with our alloc bucket.
 			 */
 			if (cache->uc_allocbucket->ub_cnt <
 			    cache->uc_freebucket->ub_cnt) {
 				bucket = cache->uc_freebucket;
 				cache->uc_freebucket = cache->uc_allocbucket;
 				cache->uc_allocbucket = bucket;
 				goto zfree_start;
 			}
 		}
 	}
 	/*
 	 * We can get here for two reasons:
 	 *
 	 * 1) The buckets are NULL
 	 * 2) The alloc and free buckets are both somewhat full.
 	 *
 	 * We must go back the zone, which requires acquiring the zone lock,
 	 * which in turn means we must release and re-acquire the critical
 	 * section.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	critical_exit();
 	ZONE_LOCK(zone);
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 	/*
 	 * Re-check the cache of the CPU we are on now: if its free bucket
 	 * has room, or a swap with the alloc bucket is possible, drop the
 	 * zone lock and retry the fast path.
 	 */
 	if (cache->uc_freebucket != NULL) {
 		if (cache->uc_freebucket->ub_cnt <
 		    cache->uc_freebucket->ub_entries) {
 			ZONE_UNLOCK(zone);
 			goto zfree_start;
 		}
 		if (cache->uc_allocbucket != NULL &&
 		    (cache->uc_allocbucket->ub_cnt <
 		    cache->uc_freebucket->ub_cnt)) {
 			ZONE_UNLOCK(zone);
 			goto zfree_start;
 		}
 	}
 
 	/* Since we have locked the zone we may as well send back our stats */
 	zone->uz_allocs += cache->uc_allocs;
 	cache->uc_allocs = 0;
 	zone->uz_frees += cache->uc_frees;
 	cache->uc_frees = 0;
 
 	bucket = cache->uc_freebucket;
 	cache->uc_freebucket = NULL;
 
 	/* Can we throw this on the zone full list? */
 	if (bucket != NULL) {
 #ifdef UMA_DEBUG_ALLOC
 		printf("uma_zfree: Putting old bucket on the free list.\n");
 #endif
 		/* ub_cnt is pointing to the last free item */
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
 		    bucket, ub_link);
 	}
 	/*
 	 * Take an empty bucket from the zone if one is available.  The
 	 * critical section is still held when it is installed.
 	 */
 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 		LIST_REMOVE(bucket, ub_link);
 		ZONE_UNLOCK(zone);
 		cache->uc_freebucket = bucket;
 		goto zfree_start;
 	}
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/* And the zone.. */
 	ZONE_UNLOCK(zone);
 
 #ifdef UMA_DEBUG_ALLOC
 	printf("uma_zfree: Allocating new free bucket.\n");
 #endif
 	/* The bucket allocation below is always attempted with M_NOWAIT. */
 	bflags = M_NOWAIT;
 
 	if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
 		bflags |= M_NOVM;
 	bucket = bucket_alloc(zone->uz_count, bflags);
 	if (bucket) {
 		/*
 		 * Park the new bucket on the zone's free list and start
 		 * over; the restart picks it up from there.
 		 */
 		ZONE_LOCK(zone);
 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
 		    bucket, ub_link);
 		ZONE_UNLOCK(zone);
 		goto zfree_restart;
 	}
 
 	/*
 	 * If nothing else caught this, we'll just do an internal free.
 	 * SKIP_DTOR is passed because the dtor already ran above.
 	 */
 zfree_internal:
 	zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
 
 	return;
 }
 
 /*
  * Frees an item to an INTERNAL zone or allocates a free bucket
  *
  * The item is put back on the free list of the slab it belongs to, and the
  * slab is moved between the keg's slab lists when its state changes.
  *
  * Arguments:
  *	zone   The zone to free to
  *	item   The item we're freeing
  *	udata  User supplied data for the dtor
  *	skip   Skip dtors and finis
  *	flags  ZFREE_STATFAIL and/or ZFREE_STATFREE; selects whether
  *	       zone->uz_fails and zone->uz_frees are incremented
  */
 static void
 zone_free_item(uma_zone_t zone, void *item, void *udata,
     enum zfreeskip skip, int flags)
 {
 	uma_slab_t slab;
 	uma_slabrefcnt_t slabref;
 	uma_keg_t keg;
 	u_int8_t *mem;
 	u_int8_t freei;
 	int clearfull;
 
 	/* The dtor and fini run before any lock is taken. */
 	if (skip < SKIP_DTOR && zone->uz_dtor)
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 	if (skip < SKIP_FINI && zone->uz_fini)
 		zone->uz_fini(item, zone->uz_size);
 
 	ZONE_LOCK(zone);
 
 	if (flags & ZFREE_STATFAIL)
 		zone->uz_fails++;
 	if (flags & ZFREE_STATFREE)
 		zone->uz_frees++;
 
 	/*
 	 * Find the slab header for the item: from the slab-aligned address
 	 * (hash lookup, or at offset uk_pgoff), or via vtoslab().
 	 */
 	if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
 		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
 		keg = zone_first_keg(zone); /* Must only be one. */
 		if (zone->uz_flags & UMA_ZONE_HASH) {
 			slab = hash_sfind(&keg->uk_hash, mem);
 		} else {
 			mem += keg->uk_pgoff;
 			slab = (uma_slab_t)mem;
 		}
 	} else {
 		/* This prevents redundant lookups via free(). */
 		if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
 			slab = (uma_slab_t)udata;
 		else
 			slab = vtoslab((vm_offset_t)item);
 		keg = slab->us_keg;
 		keg_relock(keg, zone);
 	}
 	MPASS(keg == slab->us_keg);
 
 	/*
 	 * Do we need to remove from any lists?
 	 * If this free makes every item in the slab free, the slab goes to
 	 * the free list; if the slab had no free items before, it goes to
 	 * the partial list.
 	 */
 	if (slab->us_freecount+1 == keg->uk_ipers) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
 	} else if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
 	}
 
 	/* Slab management stuff */
 	/* Index of the item within the slab's data area. */
 	freei = ((unsigned long)item - (unsigned long)slab->us_data)
 		/ keg->uk_rsize;
 
 #ifdef INVARIANTS
 	/* The debug hook runs only when skip is zero. */
 	if (!skip)
 		uma_dbg_free(zone, slab, item);
 #endif
 
 	/* Push the item onto the head of the slab's free list. */
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		slabref = (uma_slabrefcnt_t)slab;
 		slabref->us_freelist[freei].us_item = slab->us_firstfree;
 	} else {
 		slab->us_freelist[freei].us_item = slab->us_firstfree;
 	}
 	slab->us_firstfree = freei;
 	slab->us_freecount++;
 
 	/* Zone statistics */
 	keg->uk_free++;
 
 	clearfull = 0;
 	if (keg->uk_flags & UMA_ZFLAG_FULL) {
 		/* The flag is cleared only while the keg is below uk_maxpages. */
 		if (keg->uk_pages < keg->uk_maxpages) {
 			keg->uk_flags &= ~UMA_ZFLAG_FULL;
 			clearfull = 1;
 		}
 
 		/* 
 		 * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
 		 * wake up all procs blocked on pages. This should be uncommon, so 
 		 * keeping this simple for now (rather than adding count of blocked 
 		 * threads etc).
 		 */
 		wakeup(keg);
 	}
 	/*
 	 * When the keg's full flag was cleared, also clear it on the zone
 	 * and wake threads sleeping on the zone; otherwise just release
 	 * the keg.
 	 */
 	if (clearfull) {
 		zone_relock(zone, keg);
 		zone->uz_flags &= ~UMA_ZFLAG_FULL;
 		wakeup(zone);
 		ZONE_UNLOCK(zone);
 	} else
 		KEG_UNLOCK(keg);
 }
 
 /*
  * See uma.h
  *
  * Set the page limit (uk_maxpages) of the zone's first keg so that it can
  * hold at least nitems items, rounding up by one allocation (uk_ppera
  * pages) when needed.
  *
  * Returns the value uk_maxpages * uk_ipers computed from the new limit.
  */
 int
 uma_zone_set_max(uma_zone_t zone, int nitems)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
 	/*
 	 * NOTE(review): this round-up test and the returned value multiply
 	 * pages by items-per-slab without dividing by uk_ppera, whereas
 	 * uma_print_keg() reports the limit as
 	 * (uk_maxpages / uk_ppera) * uk_ipers -- confirm for kegs with
 	 * uk_ppera > 1.
 	 */
 	if (keg->uk_maxpages * keg->uk_ipers < nitems)
 		keg->uk_maxpages += keg->uk_ppera;
 	nitems = keg->uk_maxpages * keg->uk_ipers;
 	ZONE_UNLOCK(zone);
 
 	return (nitems);
 }
 
 /*
  * See uma.h
  *
  * Report the limit of the zone's first keg as uk_maxpages * uk_ipers,
  * read under the zone lock.
  */
 int
 uma_zone_get_max(uma_zone_t zone)
 {
 	int nitems;
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	nitems = keg->uk_maxpages * keg->uk_ipers;
 	ZONE_UNLOCK(zone);
 
 	return (nitems);
 }
 
 /*
  * See uma.h
  *
  * Store the warning string pointer in zone->uz_warning under the zone
  * lock.  Only the pointer is stored; the string is not copied.
  */
 void
 uma_zone_set_warning(uma_zone_t zone, const char *warning)
 {
 
 	ZONE_LOCK(zone);
 	zone->uz_warning = warning;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Estimate the number of items currently allocated from the zone as
  * allocations minus frees, summed over the zone counters and every CPU's
  * cache counters.  A negative total is reported as 0.
  */
 int
 uma_zone_get_cur(uma_zone_t zone)
 {
 	uma_cache_t pcpu;
 	int64_t outstanding;
 	u_int cpu;
 
 	ZONE_LOCK(zone);
 	/* Start from the zone-wide counters. */
 	outstanding = zone->uz_allocs - zone->uz_frees;
 	/*
 	 * Then fold in the per-CPU counters.  See the comment in
 	 * sysctl_vm_zone_stats() regarding the safety of accessing the
 	 * per-cpu caches. With the zone lock held, it is safe, but can
 	 * potentially result in stale data.
 	 */
 	CPU_FOREACH(cpu) {
 		pcpu = &zone->uz_cpu[cpu];
 		outstanding += pcpu->uc_allocs - pcpu->uc_frees;
 	}
 	ZONE_UNLOCK(zone);
 
 	if (outstanding < 0)
 		return (0);
 	return (outstanding);
 }
 
 /*
  * See uma.h
  *
  * Install uminit as the init function (uk_init) of the zone's first keg.
  * Asserts that the keg has no pages yet.
  */
 void
 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_init on non-empty keg"));
 	keg->uk_init = uminit;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Install fini as the fini function (uk_fini) of the zone's first keg.
  * Asserts that the keg has no pages yet.
  */
 void
 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_fini on non-empty keg"));
 	keg->uk_fini = fini;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Install zinit as the zone-level init function (uz_init).  Asserts that
  * the zone's first keg has no pages yet.
  */
 void
 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
 {
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zinit on non-empty keg"));
 	zone->uz_init = zinit;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Install zfini as the zone-level fini function (uz_fini).  Asserts that
  * the zone's first keg has no pages yet.
  */
 void
 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
 {
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zfini on non-empty keg"));
 	zone->uz_fini = zfini;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Install freef as the free function (uk_freef) of the zone's first keg.
  */
 /* XXX uk_freef is not actually used with the zone locked */
 void
 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
 {
 
 	ZONE_LOCK(zone);
 	zone_first_keg(zone)->uk_freef = freef;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Install allocf as the allocation function (uk_allocf) of the zone's
  * first keg and set UMA_ZFLAG_PRIVALLOC on that keg.
  */
 /* XXX uk_allocf is not actually used with the zone locked */
 void
 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
 	keg->uk_allocf = allocf;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * In this hunk the lines marked '-' are the previous uma_zone_set_obj()
  * implementation and the lines marked '+' are its replacement,
  * uma_zone_reserve_kva().
  *
  * The '+' version computes the number of slabs needed for count items
  * (count / uk_ipers, rounded up) and obtains that much address space with
  * kmem_alloc_nofault().  When UMA_MD_SMALL_ALLOC is defined and uk_ppera is
  * not greater than 1 the allocation is skipped and kva is left at 0.
  * Under the zone lock it then records the address in uk_kva, resets
  * uk_offset, sets uk_maxpages to the computed page count, installs
  * noobj_alloc or uma_small_alloc as uk_allocf, and sets
  * UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC on the keg.
  *
  * Returns 1 on success and 0 if kmem_alloc_nofault() returns 0.
  */
 int
-uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
+uma_zone_reserve_kva(uma_zone_t zone, int count)
 {
 	uma_keg_t keg;
 	vm_offset_t kva;
 	int pages;
 
 	keg = zone_first_keg(zone);
 	pages = count / keg->uk_ipers;
 
 	if (pages * keg->uk_ipers < count)
 		pages++;
 
-	kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
-
-	if (kva == 0)
-		return (0);
-	if (obj == NULL)
-		obj = vm_object_allocate(OBJT_PHYS, pages);
-	else {
-		VM_OBJECT_LOCK_INIT(obj, "uma object");
-		_vm_object_allocate(OBJT_PHYS, pages, obj);
-	}
+#ifdef UMA_MD_SMALL_ALLOC
+	if (keg->uk_ppera > 1) {
+#else
+	if (1) {
+#endif
+		kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
+		if (kva == 0)
+			return (0);
+	} else
+		kva = 0;
 	ZONE_LOCK(zone);
 	keg->uk_kva = kva;
-	keg->uk_obj = obj;
+	keg->uk_offset = 0;
 	keg->uk_maxpages = pages;
-	keg->uk_allocf = obj_alloc;
+#ifdef UMA_MD_SMALL_ALLOC
+	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
+#else
+	keg->uk_allocf = noobj_alloc;
+#endif
 	keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
 	ZONE_UNLOCK(zone);
 	return (1);
 }
 
 /*
  * See uma.h
  *
  * Allocate enough slabs up front to hold the requested number of items
  * and put them on the free-slab list of the zone's first keg.  The loop
  * stops early if a slab allocation returns NULL.
  */
 void
 uma_prealloc(uma_zone_t zone, int items)
 {
 	uma_keg_t keg;
 	uma_slab_t fresh;
 	int remaining;
 
 	keg = zone_first_keg(zone);
 	ZONE_LOCK(zone);
 	/* Round the item count up to a whole number of slabs. */
 	remaining = items / keg->uk_ipers;
 	if (remaining * keg->uk_ipers < items)
 		remaining++;
 	for (; remaining > 0; remaining--) {
 		fresh = keg_alloc_slab(keg, zone, M_WAITOK);
 		if (fresh == NULL)
 			break;
 		MPASS(fresh->us_keg == keg);
 		LIST_INSERT_HEAD(&keg->uk_free_slab, fresh, us_link);
 	}
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * See uma.h
  *
  * Return a pointer to the reference counter that the slab keeps for item.
  *
  * The slab is looked up with vtoslab() on the item's slab-aligned
  * address, the item's index is derived from its offset in the slab's
  * data area, and the counter is the us_refcnt member of the matching
  * free-list entry.  The zone argument is not used.
  */
 u_int32_t *
 uma_find_refcnt(uma_zone_t zone, void *item)
 {
 	uma_slabrefcnt_t slabref;
 	uma_keg_t keg;
 	u_int32_t *refcnt;
 	int idx;
 
 	slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
 	    (~UMA_SLAB_MASK));
 	/*
 	 * Validate the slab before reading through it, so that a failed
 	 * lookup trips the assertion instead of faulting on the us_keg
 	 * load.
 	 */
 	KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
 	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
 	keg = slabref->us_keg;
 	idx = ((unsigned long)item - (unsigned long)slabref->us_data)
 	    / keg->uk_rsize;
 	refcnt = &slabref->us_freelist[idx].us_refcnt;
 	return (refcnt);
 }
 
 /*
  * See uma.h
  *
  * Drain every zone, then drain the slab zones and the bucket zones a
  * second time.
  */
 void
 uma_reclaim(void)
 {
 #ifdef UMA_DEBUG
 	printf("UMA: vm asked us to release pages!\n");
 #endif
 	bucket_enable();
 	zone_foreach(zone_drain);
 	/*
 	 * Some slabs may have been freed after the slab zones were already
 	 * visited by the pass above, so visit them again in order to free
 	 * pages that only became empty once the other zones were drained.
 	 * We have to do the same for buckets.
 	 */
 	zone_drain(slabzone);
 	zone_drain(slabrefzone);
 	bucket_zone_drain();
 }
 
 /*
  * See uma.h
  *
  * Return the zone's UMA_ZFLAG_FULL bit, sampled under the zone lock.
  * The result is the masked flag value (non-zero when set), not
  * necessarily 1.
  */
 int
 uma_zone_exhausted(uma_zone_t zone)
 {
 	int full;
 
 	ZONE_LOCK(zone);
 	full = (zone->uz_flags & UMA_ZFLAG_FULL);
 	ZONE_UNLOCK(zone);
 	return (full);	
 }
 
 /*
  * Same test as uma_zone_exhausted(), but reads uz_flags without taking
  * the zone lock.
  */
 int
 uma_zone_exhausted_nolock(uma_zone_t zone)
 {
 	return (zone->uz_flags & UMA_ZFLAG_FULL);
 }
 
 void *
 uma_large_malloc(int size, int wait)
 {
 	void *mem;
 	uma_slab_t slab;
 	u_int8_t flags;
 
 	slab = zone_alloc_item(slabzone, NULL, wait);
 	if (slab == NULL)
 		return (NULL);
 	mem = page_alloc(NULL, size, &flags, wait);
 	if (mem) {
 		vsetslab((vm_offset_t)mem, slab);
 		slab->us_data = mem;
 		slab->us_flags = flags | UMA_SLAB_MALLOC;
 		slab->us_size = size;
 	} else {
 		zone_free_item(slabzone, slab, NULL, SKIP_NONE,
 		    ZFREE_STATFAIL | ZFREE_STATFREE);
 	}
 
 	return (mem);
 }
 
 /*
  * Release memory described by a slab header such as the one set up in
  * uma_large_malloc(): re-associate the data address with kmem_object,
  * free the pages with page_free(), and return the header to slabzone.
  */
 void
 uma_large_free(uma_slab_t slab)
 {
 	vsetobj((vm_offset_t)slab->us_data, kmem_object);
 	page_free(slab->us_data, slab->us_size, slab->us_flags);
 	zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
 }
 
 /* Print every zone by applying uma_print_zone() through zone_foreach(). */
 void
 uma_print_stats(void)
 {
 	zone_foreach(uma_print_zone);
 }
 
 /* Print one slab: its keg, data pointer, free count and first free index. */
 static void
 slab_print(uma_slab_t slab)
 {
 	printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
 		slab->us_keg, slab->us_data, slab->us_freecount,
 		slab->us_firstfree);
 }
 
 /*
  * Print a per-CPU cache: the alloc and free bucket pointers, each followed
  * by that bucket's item count (0 when the bucket pointer is NULL).
  */
 static void
 cache_print(uma_cache_t cache)
 {
 	uma_bucket_t ab, fb;
 	int acnt, fcnt;
 
 	ab = cache->uc_allocbucket;
 	fb = cache->uc_freebucket;
 	acnt = (ab != NULL) ? ab->ub_cnt : 0;
 	fcnt = (fb != NULL) ? fb->ub_cnt : 0;
 
 	printf("alloc: %p(%d), free: %p(%d)\n", ab, acnt, fb, fcnt);
 }
 
 /*
  * Print a keg's parameters followed by every slab on its partial, free
  * and full lists.
  *
  * "out" is printed as (uk_ipers * uk_pages) - uk_free and "limit" as
  * (uk_maxpages / uk_ppera) * uk_ipers.
  */
 static void
 uma_print_keg(uma_keg_t keg)
 {
 	uma_slab_t slab;
 
 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
 	    "out %d free %d limit %d\n",
 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
 	    (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
 	printf("Part slabs:\n");
 	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
 		slab_print(slab);
 	printf("Free slabs:\n");
 	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
 		slab_print(slab);
 	printf("Full slabs:\n");
 	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
 		slab_print(slab);
 }
 
 /*
  * Print a zone: its name, size and flags, then each keg linked to it, then
  * the per-CPU cache of every CPU visited by CPU_FOREACH.
  */
 void
 uma_print_zone(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	uma_klink_t kl;
 	int i;
 
 	printf("zone: %s(%p) size %d flags %#x\n",
 	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
 		uma_print_keg(kl->kl_keg);
 	CPU_FOREACH(i) {
 		cache = &zone->uz_cpu[i];
 		printf("CPU %d Cache:\n", i);
 		cache_print(cache);
 	}
 }
 
 #ifdef DDB
 /*
  * Add up a zone's statistics over the zone itself and all of its per-CPU
  * caches.  Each output pointer may be NULL, in which case that statistic
  * is not returned.
  *
  * Note: the zone statistics are left untouched, because the per-CPU cache
  * statistics cannot be safely cleared from here.
  *
  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
  * safe from off-CPU; we should modify the caches to track this information
  * directly so that we don't have to.
  */
 static void
 uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
     u_int64_t *freesp, u_int64_t *sleepsp)
 {
 	uma_bucket_t b;
 	uma_cache_t pcpu;
 	u_int64_t nallocs, nfrees, nsleeps;
 	int cached, i;
 
 	cached = 0;
 	nallocs = nfrees = nsleeps = 0;
 
 	/* Per-CPU part: cached items plus alloc/free counters. */
 	CPU_FOREACH(i) {
 		pcpu = &z->uz_cpu[i];
 		if ((b = pcpu->uc_allocbucket) != NULL)
 			cached += b->ub_cnt;
 		if ((b = pcpu->uc_freebucket) != NULL)
 			cached += b->ub_cnt;
 		nallocs += pcpu->uc_allocs;
 		nfrees += pcpu->uc_frees;
 	}
 
 	/* Zone-wide part. */
 	nallocs += z->uz_allocs;
 	nfrees += z->uz_frees;
 	nsleeps += z->uz_sleeps;
 
 	/* Hand back only what the caller asked for. */
 	if (cachefreep != NULL)
 		*cachefreep = cached;
 	if (allocsp != NULL)
 		*allocsp = nallocs;
 	if (freesp != NULL)
 		*freesp = nfrees;
 	if (sleepsp != NULL)
 		*sleepsp = nsleeps;
 }
 #endif /* DDB */
 
 /*
  * Sysctl handler that reports the number of zones.
  *
  * The count is taken by walking every keg on uma_kegs and every zone on
  * each keg's uk_zones list while holding uma_mtx, and is handed to
  * sysctl_handle_int() after the mutex is released.
  */
 static int
 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
 {
 	uma_keg_t kz;
 	uma_zone_t z;
 	int count;
 
 	count = 0;
 	mtx_lock(&uma_mtx);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 	mtx_unlock(&uma_mtx);
 	return (sysctl_handle_int(oidp, &count, 0, req));
 }
 
 /*
  * Sysctl handler that exports zone statistics as a binary stream.
  *
  * The stream is one uma_stream_header, then for every zone one
  * uma_type_header followed by (mp_maxid + 1) uma_percpu_stat records.
  * uma_mtx is held for the whole traversal and each zone is locked while
  * its records are produced.
  *
  * Returns the error from sysctl_wire_old_buffer() if wiring fails,
  * otherwise the result of sbuf_finish().
  */
 static int
 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
 	struct uma_percpu_stat ups;
 	uma_bucket_t bucket;
 	struct sbuf sbuf;
 	uma_cache_t cache;
 	uma_klink_t kl;
 	uma_keg_t kz;
 	uma_zone_t z;
 	uma_keg_t k;
 	int count, error, i;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 
 	/* First pass: count the zones for the stream header. */
 	count = 0;
 	mtx_lock(&uma_mtx);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&ush, sizeof(ush));
 	ush.ush_version = UMA_STREAM_VERSION;
 	ush.ush_maxcpus = (mp_maxid + 1);
 	ush.ush_count = count;
 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
 
 	/* Second pass: one type header plus per-CPU records per zone. */
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			bzero(&uth, sizeof(uth));
 			ZONE_LOCK(z);
 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 			uth.uth_align = kz->uk_align;
 			uth.uth_size = kz->uk_size;
 			uth.uth_rsize = kz->uk_rsize;
 			/*
 			 * NOTE(review): uth_maxpages, uth_pages and
 			 * uth_keg_free are summed over the zone's kegs, but
 			 * uth_limit is assigned, so it ends up holding the
 			 * value for the last keg only -- confirm intended.
 			 */
 			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
 				k = kl->kl_keg;
 				uth.uth_maxpages += k->uk_maxpages;
 				uth.uth_pages += k->uk_pages;
 				uth.uth_keg_free += k->uk_free;
 				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
 				    * k->uk_ipers;
 			}
 
 			/*
 			 * A zone is secondary if it is not the first entry
 			 * on the keg's zone list.
 			 */
 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 
 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
 				uth.uth_zone_free += bucket->ub_cnt;
 			uth.uth_allocs = z->uz_allocs;
 			uth.uth_frees = z->uz_frees;
 			uth.uth_fails = z->uz_fails;
 			uth.uth_sleeps = z->uz_sleeps;
 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 			/*
 			 * While it is not normally safe to access the cache
 			 * bucket pointers while not on the CPU that owns the
 			 * cache, we only allow the pointers to be exchanged
 			 * without the zone lock held, not invalidated, so
 			 * accept the possible race associated with bucket
 			 * exchange during monitoring.
 			 */
 			for (i = 0; i < (mp_maxid + 1); i++) {
 				bzero(&ups, sizeof(ups));
 				/*
 				 * Internal kegs and absent CPUs still emit
 				 * a record, but it is left zeroed.
 				 */
 				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
 					goto skip;
 				if (CPU_ABSENT(i))
 					goto skip;
 				cache = &z->uz_cpu[i];
 				if (cache->uc_allocbucket != NULL)
 					ups.ups_cache_free +=
 					    cache->uc_allocbucket->ub_cnt;
 				if (cache->uc_freebucket != NULL)
 					ups.ups_cache_free +=
 					    cache->uc_freebucket->ub_cnt;
 				ups.ups_allocs = cache->uc_allocs;
 				ups.ups_frees = cache->uc_frees;
 skip:
 				(void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
 			}
 			ZONE_UNLOCK(z);
 		}
 	}
 	mtx_unlock(&uma_mtx);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 #ifdef DDB
 /*
  * DDB "show uma" command: print one line per zone with its item size, the
  * number of items in use (allocs - frees), the number of free items held
  * in caches, buckets and (for non-secondary zones) the keg, the total
  * number of allocation requests, and the number of sleeps.
  *
  * No locks are taken.  Output stops early when the pager is quit.
  */
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
 	u_int64_t allocs, frees, sleeps;
 	uma_bucket_t bucket;
 	uma_keg_t kz;
 	uma_zone_t z;
 	int cachefree;
 
 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
 	    "Requests", "Sleeps");
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			/*
 			 * For internal kegs only the zone counters are
 			 * used; the per-CPU caches are not consulted.
 			 */
 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
 				allocs = z->uz_allocs;
 				frees = z->uz_frees;
 				sleeps = z->uz_sleeps;
 				cachefree = 0;
 			} else
 				uma_zone_sumstat(z, &cachefree, &allocs,
 				    &frees, &sleeps);
 			/*
 			 * The keg's free items are counted once, against
 			 * the zone that is not a secondary of this keg.
 			 */
 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z)))
 				cachefree += kz->uk_free;
 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
 				cachefree += bucket->ub_cnt;
 			/*
 			 * Every %ju/%jd argument is cast to the matching
 			 * (u)intmax_t type, including sleeps.
 			 */
 			db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
 			    (uintmax_t)kz->uk_size,
 			    (intmax_t)(allocs - frees), cachefree,
 			    (uintmax_t)allocs, (uintmax_t)sleeps);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 #endif
Index: user/attilio/vmc-playground/sys/vm/uma_int.h
===================================================================
--- user/attilio/vmc-playground/sys/vm/uma_int.h	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/uma_int.h	(revision 247224)
@@ -1,455 +1,455 @@
 /*-
  * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 /* 
  * This file includes definitions, structures, prototypes, and inlines that
  * should not be used outside of the actual implementation of UMA.
  */
 
 /* 
  * Here's a quick description of the relationship between the objects:
  *
  * Kegs contain lists of slabs which are stored in either the full bin, empty
  * bin, or partially allocated bin, to reduce fragmentation.  They also contain
  * the user supplied value for size, which is adjusted for alignment purposes
  * and rsize is the result of that.  The Keg also stores information for
  * managing a hash of page addresses that maps pages to uma_slab_t structures
  * for pages that don't have embedded uma_slab_t's.
  *  
  * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may
  * be allocated off the page from a special slab zone.  The free list within a
  * slab is managed with a linked list of indices, which are 8 bit values.  If
  * UMA_SLAB_SIZE is defined to be too large I will have to switch to 16bit
  * values.  Currently on alpha you can get 250 or so 32 byte items and on x86
  * you can get 250 or so 16byte items.  For item sizes that would yield more
  * than 10% memory waste we potentially allocate a separate uma_slab_t if this
  * will improve the number of items per slab that will fit.  
  *
  * Other potential space optimizations are storing the 8bit of linkage in space
  * wasted between items due to alignment problems.  This may yield a much better
  * memory footprint for certain sizes of objects.  Another alternative is to
  * increase the UMA_SLAB_SIZE, or allow for dynamic slab sizes.  I prefer
  * dynamic slab sizes because we could stick with 8 bit indices and only use
  * large slab sizes for zones with a lot of waste per slab.  This may create
  * inefficiencies in the vm subsystem due to fragmentation in the address space.
  *
  * The only really gross cases, with regards to memory waste, are for those
  * items that are just over half the page size.   You can get nearly 50% waste,
  * so you fall back to the memory footprint of the power of two allocator. I
  * have looked at memory allocation sizes on many of the machines available to
  * me, and there does not seem to be an abundance of allocations at this range
  * so at this time it may not make sense to optimize for it.  This can, of 
  * course, be solved with dynamic slab sizes.
  *
  * Kegs may serve multiple Zones but by far most of the time they only serve
  * one.  When a Zone is created, a Keg is allocated and setup for it.  While
  * the backing Keg stores slabs, the Zone caches Buckets of items allocated
  * from the slabs.  Each Zone is equipped with an init/fini and ctor/dtor
  * pair, as well as with its own set of small per-CPU caches, layered above
  * the Zone's general Bucket cache.
  *
  * The PCPU caches are protected by critical sections, and may be accessed
  * safely only from their associated CPU, while the Zones backed by the same
  * Keg all share a common Keg lock (to coalesce contention on the backing
  * slabs).  The backing Keg typically only serves one Zone but in the case of
  * multiple Zones, one of the Zones is considered the Master Zone and all
  * Zone-related stats from the Keg are done in the Master Zone.  For an
  * example of a Multi-Zone setup, refer to the Mbuf allocation code.
  */
 
 /*
  *	This is the representation for normal (Non OFFPAGE slab)
  *
  *	i == item
  *	s == slab pointer
  *
  *	<----------------  Page (UMA_SLAB_SIZE) ------------------>
  *	___________________________________________________________
  *     | _  _  _  _  _  _  _  _  _  _  _  _  _  _  _   ___________ |
  *     ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i| |slab header||
  *     ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_| |___________|| 
  *     |___________________________________________________________|
  *
  *
  *	This is an OFFPAGE slab. These can be larger than UMA_SLAB_SIZE.
  *
  *	___________________________________________________________
  *     | _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _   |
  *     ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i|  |
  *     ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_|  |
  *     |___________________________________________________________|
  *       ___________    ^
  *	|slab header|   |
  *	|___________|---*
  *
  */
 
 #ifndef VM_UMA_INT_H
 #define VM_UMA_INT_H
 
 /*
  * Slab geometry.  A slab is exactly one page, so size, mask and shift are
  * taken directly from the corresponding page constants.
  */
 #define UMA_SLAB_SIZE	PAGE_SIZE	/* How big are our slabs? */
 #define UMA_SLAB_MASK	(PAGE_SIZE - 1)	/* Mask to get back to the page */
 #define UMA_SLAB_SHIFT	PAGE_SHIFT	/* Shift count matching UMA_SLAB_SIZE */
 
 #define UMA_BOOT_PAGES		64	/* Pages allocated for startup */
 
 /* Max waste before going to off page slab management */
 #define UMA_MAX_WASTE	(UMA_SLAB_SIZE / 10)
 
 /*
  * I doubt there will be many cases where this is exceeded. This is the initial
  * size of the hash table for uma_slabs that are managed off page. This hash
  * does expand by powers of two.  Currently it doesn't get smaller.
  */
 #define UMA_HASH_SIZE_INIT	32		
 
 /*
  * Hash an address to a bucket index of a slab hash table.
  *
  *	h  pointer to the struct uma_hash being indexed
  *	s  address (pointer or integer) of the memory being looked up
  *
  * The address is shifted right by UMA_SLAB_SHIFT and masked with the
  * table's uh_hashmask.  This should yield a low number of collisions if
  * the pages are relatively contiguous; it is the same scheme that most
  * processor caches use.  Shifting and masking is used instead of %
  * because it should be faster.
  *
  * The argument 's' is parenthesized before the cast so that an expression
  * argument (for example "ptr + n") is evaluated in full before being
  * converted to an integer.
  */
 
 #define UMA_HASH(h, s) ((((unsigned long)(s)) >> UMA_SLAB_SHIFT) &	\
     (h)->uh_hashmask)
 
 /*
  * Link slab 's' onto, or unlink it from, the chain of hash table 'h' that
  * the address 'mem' hashes to (see UMA_HASH).  The chains are singly-linked
  * lists threaded through the slab's us_hlink field.
  */
 #define UMA_HASH_INSERT(h, s, mem)					\
 		SLIST_INSERT_HEAD(&(h)->uh_slab_hash[UMA_HASH((h),	\
 		    (mem))], (s), us_hlink)
 #define UMA_HASH_REMOVE(h, s, mem)					\
 		SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h),		\
 		    (mem))], (s), uma_slab, us_hlink)
 
 /* Hash table for freed address -> slab translation */
 
 /* Head type of one hash chain: a singly-linked list of slabs. */
 SLIST_HEAD(slabhead, uma_slab);
 
 /*
  * The table itself: an array of chain heads plus its current size and the
  * mask that UMA_HASH applies to select a chain.
  */
 struct uma_hash {
 	struct slabhead	*uh_slab_hash;	/* Hash table for slabs */
 	int		uh_hashsize;	/* Current size of the hash table */
 	int		uh_hashmask;	/* Mask used during hashing */
 };
 
 /*
  * align field or structure to cache line
  *
  * Only amd64 gets the alignment attribute; on every other architecture
  * UMA_ALIGN expands to nothing.
  */
 #if defined(__amd64__)
 #define UMA_ALIGN	__aligned(CACHE_LINE_SIZE)
 #else
 #define UMA_ALIGN
 #endif
 
 /*
  * Structures for per cpu queues.
  */
 
 /*
  * A bucket: a counted array of item pointers that can be linked onto a
  * zone's bucket lists.  The pointer array is a flexible array member, so
  * its capacity (ub_entries) is chosen when the bucket is allocated.
  */
 struct uma_bucket {
 	LIST_ENTRY(uma_bucket)	ub_link;	/* Link into the zone */
 	int16_t	ub_cnt;				/* Count of free items. */
 	int16_t	ub_entries;			/* Max items. */
 	void	*ub_bucket[];			/* actual allocation storage */
 };
 
 typedef struct uma_bucket * uma_bucket_t;
 
 /*
  * Per-CPU cache of a zone: one bucket to allocate from, one bucket to free
  * into, and per-CPU allocation/free counters.  The structure carries
  * UMA_ALIGN, which requests cache-line alignment on amd64 only.
  */
 struct uma_cache {
 	uma_bucket_t	uc_freebucket;	/* Bucket we're freeing to */
 	uma_bucket_t	uc_allocbucket;	/* Bucket to allocate from */
 	u_int64_t	uc_allocs;	/* Count of allocations */
 	u_int64_t	uc_frees;	/* Count of frees */
 } UMA_ALIGN;
 
 typedef struct uma_cache * uma_cache_t;
 
 /*
  * Keg management structure
  *
  * A keg holds the slabs (on the partial, free and full lists), the item
  * size and layout parameters, the backend allocation hooks, and the list
  * of zones it serves.
  *
  * TODO: Optimize for cache line size
  *
  */
 struct uma_keg {
 	LIST_ENTRY(uma_keg)	uk_link;	/* List of all kegs */
 
 	struct mtx	uk_lock;	/* Lock for the keg */
 	struct uma_hash	uk_hash;	/* Address -> slab hash table */
 
 	const char	*uk_name;		/* Name of creating zone. */
 	LIST_HEAD(,uma_zone)	uk_zones;	/* Keg's zones */
 	LIST_HEAD(,uma_slab)	uk_part_slab;	/* partially allocated slabs */
 	LIST_HEAD(,uma_slab)	uk_free_slab;	/* empty slab list */
 	LIST_HEAD(,uma_slab)	uk_full_slab;	/* full slabs */
 
 	u_int32_t	uk_recurse;	/* Allocation recursion count */
 	u_int32_t	uk_align;	/* Alignment mask */
 	u_int32_t	uk_pages;	/* Total page count */
 	u_int32_t	uk_free;	/* Count of items free in slabs */
 	u_int32_t	uk_size;	/* Requested size of each item */
 	u_int32_t	uk_rsize;	/* Real size of each item */
 	u_int32_t	uk_maxpages;	/* Maximum number of pages to alloc */
 
 	uma_init	uk_init;	/* Keg's init routine */
 	uma_fini	uk_fini;	/* Keg's fini routine */
 	uma_alloc	uk_allocf;	/* Allocation function */
 	uma_free	uk_freef;	/* Free routine */
 
-	struct vm_object	*uk_obj;	/* Zone specific object */
-	vm_offset_t	uk_kva;		/* Base kva for zones with objs */
+	u_long		uk_offset;	/* Zone specific next page index */
+	vm_offset_t	uk_kva;		/* Zone base kva */
 	uma_zone_t	uk_slabzone;	/* Slab zone backing us, if OFFPAGE */
 
 	u_int16_t	uk_pgoff;	/* Offset to uma_slab struct */
 	u_int16_t	uk_ppera;	/* pages per allocation from backend */
 	u_int16_t	uk_ipers;	/* Items per slab */
 	u_int32_t	uk_flags;	/* Internal flags */
 };
 typedef struct uma_keg	* uma_keg_t;
 
 /* Page management structure */
 
 /*
  * Header shared by both slab layouts (struct uma_slab and
  * struct uma_slab_refcnt).  The us_type union overlays the list linkage
  * with an allocation size, so only one of the two is meaningful for any
  * given slab.
  */
 /* Sorry for the union, but space efficiency is important */
 struct uma_slab_head {
 	uma_keg_t	us_keg;			/* Keg we live in */
 	union {
 		LIST_ENTRY(uma_slab)	_us_link;	/* slabs in zone */
 		unsigned long	_us_size;	/* Size of allocation */
 	} us_type;
 	SLIST_ENTRY(uma_slab)	us_hlink;	/* Link for hash table */
 	u_int8_t	*us_data;		/* First item */
 	u_int8_t	us_flags;		/* Page flags see uma.h */
 	u_int8_t	us_freecount;	/* How many are free? */
 	u_int8_t	us_firstfree;	/* First free item index */
 };
 
 /*
  * The standard slab structure: the common header followed by the free
  * list, one 8-bit index entry per item.  The array is declared with one
  * element but is sized larger when the slab is laid out.
  */
 struct uma_slab {
 	struct uma_slab_head	us_head;	/* slab header data */
 	struct {
 		u_int8_t	us_item;
 	} us_freelist[1];			/* actual number bigger */
 };
 
 /*
  * The slab structure for UMA_ZONE_REFCNT zones for whose items we
  * maintain reference counters in the slab for.
  *
  * Same layout as struct uma_slab except that each free-list entry also
  * carries a 32-bit reference count for its item.
  */
 struct uma_slab_refcnt {
 	struct uma_slab_head	us_head;	/* slab header data */
 	struct {
 		u_int8_t	us_item;
 		u_int32_t	us_refcnt;
 	} us_freelist[1];			/* actual number bigger */
 };
 
 /*
  * Shorthand accessors: let code write slab->us_keg, slab->us_link, etc.
  * instead of spelling out the path through us_head (and us_type).
  */
 #define	us_keg		us_head.us_keg
 #define	us_link		us_head.us_type._us_link
 #define	us_size		us_head.us_type._us_size
 #define	us_hlink	us_head.us_hlink
 #define	us_data		us_head.us_data
 #define	us_flags	us_head.us_flags
 #define	us_freecount	us_head.us_freecount
 #define	us_firstfree	us_head.us_firstfree
 
 /* Pointer typedefs for the two slab layouts. */
 typedef struct uma_slab * uma_slab_t;
 typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
 /* Signature of a zone's "allocate a slab from the backend" hook (uz_slab). */
 typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
 
 
 /*
  * These give us the size of one free item reference within our corresponding
  * uma_slab structures, so that our calculations during zone setup are correct
  * regardless of what the compiler decides to do with padding the structure
  * arrays within uma_slab.
  *
  * Each is computed as the size of the whole slab structure (which declares
  * exactly one free-list element) minus the size of the common header.
  */
 #define	UMA_FRITM_SZ	(sizeof(struct uma_slab) - sizeof(struct uma_slab_head))
 #define	UMA_FRITMREF_SZ	(sizeof(struct uma_slab_refcnt) -	\
     sizeof(struct uma_slab_head))
 
 /*
  * Link element tying a keg to a zone: zones keep a list of these
  * (uz_kegs), each pointing at one keg.
  */
 struct uma_klink {
 	LIST_ENTRY(uma_klink)	kl_link;	/* Entry on the zone's uz_kegs list */
 	uma_keg_t		kl_keg;		/* Keg this link refers to */
 };
 typedef struct uma_klink *uma_klink_t;
 
 /*
  * Zone management structure 
  *
  * A zone caches buckets of items on top of one or more kegs, and carries
  * the per-item ctor/dtor/init/fini hooks, statistics counters, and the
  * array of per-CPU caches.
  *
  * TODO: Optimize for cache line size
  *
  */
 struct uma_zone {
 	const char	*uz_name;	/* Text name of the zone */
 	struct mtx	*uz_lock;	/* Lock for the zone (keg's lock) */
 
 	LIST_ENTRY(uma_zone)	uz_link;	/* List of all zones in keg */
 	LIST_HEAD(,uma_bucket)	uz_full_bucket;	/* full buckets */
 	LIST_HEAD(,uma_bucket)	uz_free_bucket;	/* Buckets for frees */
 
 	LIST_HEAD(,uma_klink)	uz_kegs;	/* List of kegs. */
 	struct uma_klink	uz_klink;	/* klink for first keg. */
 
 	uma_slaballoc	uz_slab;	/* Allocate a slab from the backend. */
 	uma_ctor	uz_ctor;	/* Constructor for each allocation */
 	uma_dtor	uz_dtor;	/* Destructor */
 	uma_init	uz_init;	/* Initializer for each item */
 	uma_fini	uz_fini;	/* Discards memory */
 
 	u_int32_t	uz_flags;	/* Flags inherited from kegs */
 	u_int32_t	uz_size;	/* Size inherited from kegs */
 
 	u_int64_t	uz_allocs UMA_ALIGN; /* Total number of allocations */
 	u_int64_t	uz_frees;	/* Total number of frees */
 	u_int64_t	uz_fails;	/* Total number of alloc failures */
 	u_int64_t	uz_sleeps;	/* Total number of alloc sleeps */
 	uint16_t	uz_fills;	/* Outstanding bucket fills */
 	uint16_t	uz_count;	/* Highest amount of items in bucket */
 
 	/* The next two fields are used to print rate-limited warnings. */
 	const char	*uz_warning;	/* Warning to print on failure */
 	struct timeval	uz_ratecheck;	/* Warnings rate-limiting */
 
 	/*
 	 * This HAS to be the last item because we adjust the zone size
 	 * based on NCPU and then allocate the space for the zones.
 	 */
 	struct uma_cache	uz_cpu[1]; /* Per cpu caches */
 };
 
 /*
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
  *
  * They occupy the top bits of the 32-bit flags word (0x02000000 and up).
  */
 #define	UMA_ZFLAG_BUCKET	0x02000000	/* Bucket zone. */
 #define	UMA_ZFLAG_MULTI		0x04000000	/* Multiple kegs in the zone. */
 #define	UMA_ZFLAG_DRAINING	0x08000000	/* Running zone_drain. */
 #define UMA_ZFLAG_PRIVALLOC	0x10000000	/* Use uz_allocf. */
 #define UMA_ZFLAG_INTERNAL	0x20000000	/* No offpage no PCPU. */
 #define UMA_ZFLAG_FULL		0x40000000	/* Reached uz_maxpages */
 #define UMA_ZFLAG_CACHEONLY	0x80000000	/* Don't ask VM for buckets. */
 
 /* Mask combining the INTERNAL, CACHEONLY and BUCKET flags. */
 #define	UMA_ZFLAG_INHERIT	(UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | \
 				    UMA_ZFLAG_BUCKET)
 
 #undef UMA_ALIGN
 
 #ifdef _KERNEL
 /* Internal prototypes */
 /* hash_sfind() is defined as an inline later in this header. */
 static __inline uma_slab_t hash_sfind(struct uma_hash *hash, u_int8_t *data);
 /* NOTE(review): presumably the allocator path for oversized requests; defined elsewhere. */
 void *uma_large_malloc(int size, int wait);
 void uma_large_free(uma_slab_t slab);
 
 /* Lock Macros */
 
 /*
  * Initialize a keg's mutex.  When 'lc' is non-zero the keg's own name is
  * passed as the lock type; otherwise the shared type string "UMA zone" is
  * used.  Either way the mutex is a default mutex with MTX_DUPOK set.
  */
 #define	KEG_LOCK_INIT(k, lc)					\
 	do {							\
 		if ((lc))					\
 			mtx_init(&(k)->uk_lock, (k)->uk_name,	\
 			    (k)->uk_name, MTX_DEF | MTX_DUPOK);	\
 		else						\
 			mtx_init(&(k)->uk_lock, (k)->uk_name,	\
 			    "UMA zone", MTX_DEF | MTX_DUPOK);	\
 	} while (0)
 	    
 /*
  * KEG_* operate on the mutex embedded in the keg; ZONE_* operate on the
  * mutex the zone points at through uz_lock.
  */
 #define	KEG_LOCK_FINI(k)	mtx_destroy(&(k)->uk_lock)
 #define	KEG_LOCK(k)	mtx_lock(&(k)->uk_lock)
 #define	KEG_UNLOCK(k)	mtx_unlock(&(k)->uk_lock)
 #define	ZONE_LOCK(z)	mtx_lock((z)->uz_lock)
 #define ZONE_UNLOCK(z)	mtx_unlock((z)->uz_lock)
 
 /*
  * Look up the slab that owns a given data address in a slab hash table.
  * Used for OFFPAGE zones, whose slab headers are not stored in the page.
  *
  * Arguments:
  *	hash  The hash table to search.
  *	data  The base page of the item.
  *
  * Returns:
  *	The slab whose us_data equals 'data', or NULL if no slab on the
  *	selected chain matches.
  */
 static __inline uma_slab_t
 hash_sfind(struct uma_hash *hash, u_int8_t *data)
 {
 	uma_slab_t cur;
 	int chain;
 
 	/* Select the chain that this address hashes to. */
 	chain = UMA_HASH(hash, data);
 
 	/* Walk the chain looking for a slab whose first item is 'data'. */
 	for (cur = SLIST_FIRST(&hash->uh_slab_hash[chain]); cur != NULL;
 	    cur = SLIST_NEXT(cur, us_hlink)) {
 		if ((u_int8_t *)cur->us_data == data)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Return the slab recorded on the vm_page backing kernel virtual address
  * 'va', or NULL when that page is not flagged PG_SLAB.
  */
 static __inline uma_slab_t
 vtoslab(vm_offset_t va)
 {
 	vm_page_t pg;
 
 	pg = PHYS_TO_VM_PAGE(pmap_kextract(va));
 	if ((pg->flags & PG_SLAB) == 0)
 		return (NULL);
 	/* On PG_SLAB pages the object field holds the slab pointer. */
 	return ((uma_slab_t)pg->object);
 }
 
 /*
  * Record 'slab' on the vm_page backing kernel virtual address 'va': the
  * slab pointer is stored in the page's object field and PG_SLAB is set.
  */
 static __inline void
 vsetslab(vm_offset_t va, uma_slab_t slab)
 {
 	vm_page_t pg = PHYS_TO_VM_PAGE(pmap_kextract(va));
 
 	pg->object = (vm_object_t)slab;
 	pg->flags |= PG_SLAB;
 }
 
 /*
  * Point the vm_page backing kernel virtual address 'va' at a real VM
  * object again and clear its PG_SLAB flag.
  */
 static __inline void
 vsetobj(vm_offset_t va, vm_object_t obj)
 {
 	vm_page_t pg = PHYS_TO_VM_PAGE(pmap_kextract(va));
 
 	pg->object = obj;
 	pg->flags &= ~PG_SLAB;
 }
 
 /*
  * The following two functions may be defined by architecture specific code
  * if they can provide more efficient allocation functions.  This is useful
  * for using direct mapped addresses.
  */
 void *uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait);
 void uma_small_free(void *mem, int size, u_int8_t flags);
 #endif /* _KERNEL */
 
 #endif /* VM_UMA_INT_H */
Index: user/attilio/vmc-playground/sys/vm/vm_map.c
===================================================================
--- user/attilio/vmc-playground/sys/vm/vm_map.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/vm_map.c	(revision 247224)
@@ -1,4095 +1,4094 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory mapping module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/vnode.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/file.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vnode_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 /*
  *	Virtual memory maps provide for the mapping, protection,
  *	and sharing of virtual memory objects.  In addition,
  *	this module provides for an efficient virtual copy of
  *	memory from one map to another.
  *
  *	Synchronization is required prior to most operations.
  *
  *	Maps consist of an ordered doubly-linked list of simple
  *	entries; a self-adjusting binary search tree of these
  *	entries is used to speed up lookups.
  *
  *	Since portions of maps are specified by start/end addresses,
  *	which may not align with existing map entries, all
  *	routines merely "clip" entries to these start/end values.
  *	[That is, an entry is split into two, bordering at a
  *	start or end value.]  Note that these clippings may not
  *	always be necessary (as the two resulting entries are then
  *	not changed); however, the clipping is done for convenience.
  *
  *	As mentioned above, virtual copy operations are performed
  *	by copying VM object references from one map to
  *	another, and then marking both regions as copy-on-write.
  */
 
 /* Mutex initialized in vm_map_startup() as "vm map sleep mutex". */
 static struct mtx map_sleep_mtx;
 /* UMA zones: "MAP ENTRY", "KMAP ENTRY" and "MAP" are created in
  * vm_map_startup(); "VMSPACE" is created in vm_init2(). */
 static uma_zone_t mapentzone;
 static uma_zone_t kmapentzone;
 static uma_zone_t mapzone;
 static uma_zone_t vmspace_zone;
-static struct vm_object kmapentobj;
 /*
  * Forward declarations for the file-local UMA zone callbacks and map
  * helpers.  The destructor callbacks exist only in INVARIANTS kernels.
  */
 static int vmspace_zinit(void *mem, int size, int flags);
 static void vmspace_zfini(void *mem, int size);
 static int vm_map_zinit(void *mem, int size, int flags);
 static void vm_map_zfini(void *mem, int size);
 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
     vm_offset_t max);
 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
 #ifdef INVARIANTS
 static void vm_map_zdtor(void *mem, int size, void *arg);
 static void vmspace_zdtor(void *mem, int size, void *arg);
 #endif
 
 /*
  * True when map entry 'e' has a credential attached, either directly
  * (e->cred) or through its backing VM object, provided in the latter case
  * that the entry is not marked MAP_ENTRY_NEEDS_COPY.
  */
 #define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
 
 /* 
  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
  * stable.
  *
  * Both macros currently expand to an empty statement; they mark the places
  * where a real lock would be taken.
  */
 #define PROC_VMSPACE_LOCK(p) do { } while (0)
 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
 
 /*
  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
  *
  *	Clamps the starting and ending region addresses to the valid
  *	range of the map: 'start' is raised to vm_map_min(map), 'end' is
  *	lowered to vm_map_max(map), and if 'start' still exceeds 'end'
  *	it is set equal to 'end'.  Nothing is asserted.
  *
  *	'start' and 'end' are assigned to, so they must be lvalues, and
  *	each argument is evaluated more than once.  The expansion is a
  *	bare brace block rather than a do { } while (0) statement.
  */
 #define	VM_MAP_RANGE_CHECK(map, start, end)		\
 		{					\
 		if (start < vm_map_min(map))		\
 			start = vm_map_min(map);	\
 		if (end > vm_map_max(map))		\
 			end = vm_map_max(map);		\
 		if (start > end)			\
 			start = end;			\
 		}
 
 /*
  *	vm_map_startup:
  *
  *	Initialize the vm_map module.  Must be called before
  *	any other vm_map routines.
  *
  *	Map and entry structures are allocated from the general
  *	purpose memory pool with some exceptions:
  *
  *	- The kernel map and kmem submap are allocated statically.
  *	- Kernel map entries are allocated out of a static pool.
  *
  *	These restrictions are necessary since malloc() uses the
  *	maps and requires map entries.
  *
  *	Concretely this initializes map_sleep_mtx and creates the "MAP",
  *	"KMAP ENTRY" and "MAP ENTRY" zones, preallocating MAX_KMAP maps
  *	and MAX_KMAPENT kernel map entries.
  */
 
 void
 vm_map_startup(void)
 {
 	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
 	/* The map destructor is only installed in INVARIANTS kernels. */
 	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
 #ifdef INVARIANTS
 	    vm_map_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_prealloc(mapzone, MAX_KMAP);
 	/* Kernel map entries get their own zone, filled up front. */
 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
 	uma_prealloc(kmapentzone, MAX_KMAPENT);
 	/* Ordinary map entries: plain zone, no callbacks, no flags. */
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 /*
  * UMA fini callback for the vmspace zone: tears down the vm_map embedded
  * in the vmspace by delegating to vm_map_zfini().
  */
 static void
 vmspace_zfini(void *mem, int size)
 {
 	struct vmspace *vmsp = mem;
 
 	vm_map_zfini(&vmsp->vm_map, sizeof(vmsp->vm_map));
 }
 
 /*
  * UMA init callback for the vmspace zone: marks the embedded map as
  * having no pmap yet and initializes the map via vm_map_zinit().
  * Always returns 0.
  */
 static int
 vmspace_zinit(void *mem, int size, int flags)
 {
 	struct vmspace *vmsp = mem;
 
 	/* vmspace_alloc() tests this NULL to decide whether to pmap_pinit(). */
 	vmsp->vm_map.pmap = NULL;
 	(void)vm_map_zinit(&vmsp->vm_map, sizeof(vmsp->vm_map), flags);
 	return (0);
 }
 
 /*
  * UMA fini callback for maps: destroys the two locks that
  * vm_map_zinit() created (the system mutex, then the sx lock).
  */
 static void
 vm_map_zfini(void *mem, int size)
 {
 	vm_map_t m = mem;
 
 	mtx_destroy(&m->system_mtx);
 	sx_destroy(&m->lock);
 }
 
 /*
  * UMA init callback for maps: zeroes the entry count and size and
  * creates both of the map's locks (a mutex for system maps, an sx lock
  * for user maps).  Always returns 0.
  */
 static int
 vm_map_zinit(void *mem, int size, int flags)
 {
 	vm_map_t m = mem;
 
 	m->nentries = 0;
 	m->size = 0;
 	mtx_init(&m->system_mtx, "vm map (system)", NULL,
 	    MTX_DEF | MTX_DUPOK);
 	sx_init(&m->lock, "vm map (user)");
 	return (0);
 }
 
 #ifdef INVARIANTS
 /*
  * INVARIANTS-only UMA dtor for the vmspace zone: runs the map destructor
  * checks on the embedded vm_map.
  */
 static void
 vmspace_zdtor(void *mem, int size, void *arg)
 {
 	struct vmspace *vmsp = mem;
 
 	vm_map_zdtor(&vmsp->vm_map, sizeof(vmsp->vm_map), arg);
 }
 /*
  * INVARIANTS-only UMA dtor for maps: asserts that a map being returned
  * to the zone has no entries left and a size of zero.
  */
 static void
 vm_map_zdtor(void *mem, int size, void *arg)
 {
 	vm_map_t m = mem;
 
 	KASSERT(m->nentries == 0,
 	    ("map %p nentries == %d on free.", m, m->nentries));
 	KASSERT(m->size == 0,
 	    ("map %p size == %lu on free.", m, (unsigned long)m->size));
 }
 #endif	/* INVARIANTS */
 
 /*
  * Allocate a vmspace structure, including a vm_map and pmap,
  * and initialize those structures.  The refcnt is set to 1.
  *
  * Arguments:
  *	min, max  Address bounds handed to _vm_map_init() for the new map.
  *
  * Returns:
  *	The new vmspace, or NULL when the pmap had to be initialized
  *	(vm_map.pmap was still NULL) and pmap_pinit() failed; in that case
  *	the vmspace is returned to its zone first.
  */
 struct vmspace *
 vmspace_alloc(vm_offset_t min, vm_offset_t max)
 {
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
 	/* Only initialize the pmap when this vmspace does not have one yet. */
 	if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) {
 		uma_zfree(vmspace_zone, vm);
 		return (NULL);
 	}
 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
 	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
 	vm->vm_refcnt = 1;
 	vm->vm_shm = NULL;
 	vm->vm_swrss = 0;
 	vm->vm_tsize = 0;
 	vm->vm_dsize = 0;
 	vm->vm_ssize = 0;
 	vm->vm_taddr = 0;
 	vm->vm_daddr = 0;
 	vm->vm_maxsaddr = 0;
 	return (vm);
 }
 
 /*
  * Second-stage VM map initialization: sizes the kernel map entry zone
  * and creates the "VMSPACE" zone.
  *
  * The count passed for kmapentzone is
  *	min(page count, kernel address space in pages) / 8
  *	    + maxproc * 2 + maxfiles.
  */
 void
 vm_init2(void)
 {
-	uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
+	uma_zone_reserve_kva(kmapentzone, lmin(cnt.v_page_count,
 	    (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE) / 8 +
 	     maxproc * 2 + maxfiles);
 	/* The vmspace destructor is only installed in INVARIANTS kernels. */
 	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
 #ifdef INVARIANTS
 	    vmspace_zdtor,
 #else
 	    NULL,
 #endif
 	    vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 
 /*
  * With RACCT compiled in, set the process's data, stack, RSS, memlock and
  * vmem resource-accounting values to zero while holding the process
  * lock.  Without RACCT this function does nothing.
  */
 static void
 vmspace_container_reset(struct proc *p)
 {
 
 #ifdef RACCT
 	/* Resources to clear, in the order they are reset. */
 	static const int cleared[] = {
 		RACCT_DATA, RACCT_STACK, RACCT_RSS, RACCT_MEMLOCK, RACCT_VMEM
 	};
 	unsigned int idx;
 
 	PROC_LOCK(p);
 	for (idx = 0; idx < sizeof(cleared) / sizeof(cleared[0]); idx++)
 		racct_set(p, cleared[idx], 0);
 	PROC_UNLOCK(p);
 #endif
 }
 
 static inline void
 vmspace_dofree(struct vmspace *vm)
 {
 
 	CTR1(KTR_VM, "vmspace_free: %p", vm);
 
 	/*
 	 * Make sure any SysV shm is freed, it might not have been in
 	 * exit1().
 	 */
 	shmexit(vm);
 
 	/*
 	 * Lock the map, to wait out all other references to it.
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
 	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
 	    vm->vm_map.max_offset);
 
 	pmap_release(vmspace_pmap(vm));
 	vm->vm_map.pmap = NULL;
 	uma_zfree(vmspace_zone, vm);
 }
 
 /*
  * Drop one reference on a vmspace and tear it down when that was the
  * last one.  Panics if the reference count is already zero on entry.
  */
 void
 vmspace_free(struct vmspace *vm)
 {
 
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
 
 	/* atomic_fetchadd_int() returns the value before the decrement. */
 	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) != 1)
 		return;
 	vmspace_dofree(vm);
 }
 
 /*
  * Detach the process from its vmspace and drop the process's reference.
  * The vmspace is asserted to be vmspace0 at this point.
  */
 void
 vmspace_exitfree(struct proc *p)
 {
 	struct vmspace *detached;
 
 	PROC_VMSPACE_LOCK(p);
 	detached = p->p_vmspace;
 	p->p_vmspace = NULL;
 	PROC_VMSPACE_UNLOCK(p);
 	KASSERT(detached == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
 	vmspace_free(detached);
 }
 
 /*
  * Release the exiting thread's process from its user address space and
  * move it onto vmspace0.  If this drops the last reference, the vmspace
  * is torn down here; otherwise whoever drops the last reference frees it.
  */
 void
 vmspace_exit(struct thread *td)
 {
 	int refcnt;
 	struct vmspace *vm;
 	struct proc *p;
 
 	/*
 	 * Release user portion of address space.
 	 * This releases references to vnodes,
 	 * which could cause I/O if the file has been unlinked.
 	 * Need to do this early enough that we can still sleep.
 	 *
 	 * The last exiting process to reach this point releases as
 	 * much of the environment as it can. vmspace_dofree() is the
 	 * slower fallback in case another process had a temporary
 	 * reference to the vmspace.
 	 */
 
 	p = td->td_proc;
 	vm = p->p_vmspace;
 	/* The process ends up on vmspace0, so take that reference up front. */
 	atomic_add_int(&vmspace0.vm_refcnt, 1);
 	/*
 	 * Drop our reference with a compare-and-set loop.  While other
 	 * references exist, switch to vmspace0 before the decrement lands.
 	 */
 	do {
 		refcnt = vm->vm_refcnt;
 		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
 			/* Switch now since other proc might free vmspace */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = &vmspace0;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
 	/* refcnt holds the pre-decrement value: 1 means we were the last. */
 	if (refcnt == 1) {
 		if (p->p_vmspace != vm) {
 			/* vmspace not yet freed, switch back */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = vm;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 		pmap_remove_pages(vmspace_pmap(vm));
 		/* Switch now since this proc will free vmspace */
 		PROC_VMSPACE_LOCK(p);
 		p->p_vmspace = &vmspace0;
 		PROC_VMSPACE_UNLOCK(p);
 		pmap_activate(td);
 		vmspace_dofree(vm);
 	}
 	vmspace_container_reset(p);
 }
 
 /*
  * Acquire reference to vmspace owned by another process.
  *
  * Returns the referenced vmspace, or NULL when the process has no
  * vmspace, when the vmspace's reference count is already zero or
  * negative, or when the process switched to a different vmspace while
  * the reference was being taken.
  */
 
 struct vmspace *
 vmspace_acquire_ref(struct proc *p)
 {
 	struct vmspace *vm;
 	int refcnt;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	if (vm == NULL) {
 		PROC_VMSPACE_UNLOCK(p);
 		return (NULL);
 	}
 	/* Increment with compare-and-set so a zero count is never revived. */
 	do {
 		refcnt = vm->vm_refcnt;
 		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
 			PROC_VMSPACE_UNLOCK(p);
 			return (NULL);
 		}
 	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
 	/* The process changed vmspace meanwhile: give the reference back. */
 	if (vm != p->p_vmspace) {
 		PROC_VMSPACE_UNLOCK(p);
 		vmspace_free(vm);
 		return (NULL);
 	}
 	PROC_VMSPACE_UNLOCK(p);
 	return (vm);
 }
 
 /*
  * Take the map's exclusive lock (the sx lock for user maps, the mutex
  * for system maps) and bump the map's timestamp.
  */
 void
 _vm_map_lock(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map)
 		sx_xlock_(&map->lock, file, line);
 	else
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 	map->timestamp++;
 }
 
 /*
  * Drain the current thread's list of deferred map entries
  * (td_map_def_user).  Entries flagged MAP_ENTRY_VN_WRITECNT first release
  * their write count through the vnode pager; every entry is then
  * deallocated.
  */
 static void
 vm_map_process_deferred(void)
 {
 	struct thread *self;
 	vm_map_entry_t cur, following;
 	vm_object_t obj;
 
 	self = curthread;
 	/* Detach the whole list from the thread before walking it. */
 	cur = self->td_map_def_user;
 	self->td_map_def_user = NULL;
 	for (; cur != NULL; cur = following) {
 		/* Save the successor; the entry is deallocated below. */
 		following = cur->next;
 		if ((cur->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
 			/*
 			 * Decrement the object's writemappings and
 			 * possibly the vnode's v_writecount.
 			 */
 			KASSERT((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 			    ("Submap with writecount"));
 			obj = cur->object.vm_object;
 			KASSERT(obj != NULL, ("No object for writecount"));
 			vnode_pager_release_writecount(obj, cur->start,
 			    cur->end);
 		}
 		vm_map_entry_deallocate(cur, FALSE);
 	}
 }
 
 /*
  * Release the map's exclusive lock.  For user maps the thread's deferred
  * entries are processed after the sx lock has been dropped.
  */
 void
 _vm_map_unlock(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map) {
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 		return;
 	}
 	sx_xunlock_(&map->lock, file, line);
 	vm_map_process_deferred();
 }
 
 /*
  * Take the map's lock for reading: a shared sx lock for user maps.
  * System maps only have a mutex, so it is simply acquired.
  */
 void
 _vm_map_lock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map)
 		sx_slock_(&map->lock, file, line);
 	else
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 }
 
 /*
  * Release a read lock on the map.  For user maps the thread's deferred
  * entries are processed after the shared sx lock has been dropped.
  */
 void
 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map) {
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 		return;
 	}
 	sx_sunlock_(&map->lock, file, line);
 	vm_map_process_deferred();
 }
 
 /*
  * Try to take the map's exclusive lock without blocking.  Returns 1 and
  * bumps the map's timestamp on success, 0 on failure.
  */
 int
 _vm_map_trylock(vm_map_t map, const char *file, int line)
 {
 	int acquired;
 
 	if (map->system_map)
 		acquired = mtx_trylock_flags_(&map->system_mtx, 0, file, line);
 	else
 		acquired = sx_try_xlock_(&map->lock, file, line);
 	if (acquired != 0)
 		map->timestamp++;
 	return (acquired != 0);
 }
 
 /*
  * Try to acquire the map for reading without blocking.  Returns 1 on
  * success and 0 on failure; the map timestamp is left untouched.
  */
 int
 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
 {
 	int acquired;
 
 	if (map->system_map)
 		acquired = mtx_trylock_flags_(&map->system_mtx, 0, file, line);
 	else
 		acquired = sx_try_slock_(&map->lock, file, line);
 	return (acquired != 0);
 }
 
 /*
  *	_vm_map_lock_upgrade:	[ internal use only ]
  *
  *	Tries to upgrade a read (shared) lock on the specified map to a write
  *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
  *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
  *	returned without a read or write lock held.
  *
  *	Requires that the map be read locked.
  */
 int
 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
 {
 	unsigned int stamp;
 
 	if (map->system_map) {
 		/* The system map mutex is the only lock; just check it is held. */
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else if (!sx_try_upgrade_(&map->lock, file, line)) {
 		/*
 		 * The in-place upgrade failed.  Remember the timestamp,
 		 * drop the shared lock, and reacquire exclusively.
 		 */
 		stamp = map->timestamp;
 		sx_sunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 		sx_xlock_(&map->lock, file, line);
 		/*
 		 * The upgrade only counts as successful if the timestamp
 		 * did not change while the map was unlocked.
 		 */
 		if (stamp != map->timestamp) {
 			sx_xunlock_(&map->lock, file, line);
 			return (1);
 		}
 	}
 	map->timestamp++;
 	return (0);
 }
 
 /*
  * Downgrade an exclusive hold on the map to a shared one.  For a
  * system map this only asserts that the mutex is owned.
  */
 void
 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map) {
 		sx_downgrade_(&map->lock, file, line);
 	} else {
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	}
 }
 
 /*
  *	vm_map_locked:
  *
  *	Returns a non-zero value if the caller holds a write (exclusive) lock
  *	on the specified map and the value "0" otherwise.
  */
 int
 vm_map_locked(vm_map_t map)
 {
 
 	/* Ownership test depends on which lock protects this map. */
 	return (map->system_map ? mtx_owned(&map->system_mtx) :
 	    sx_xlocked(&map->lock));
 }
 
 #ifdef INVARIANTS
 /*
  * Assert that the map's lock is held: the mutex must be owned for a
  * system map, the sx lock must be exclusively held otherwise.
  */
 static void
 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map)
 		sx_assert_(&map->lock, SA_XLOCKED, file, line);
 	else
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 }
 
 #define	VM_MAP_ASSERT_LOCKED(map) \
     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
 #else
 /* Without INVARIANTS the assertion expands to nothing. */
 #define	VM_MAP_ASSERT_LOCKED(map)
 #endif
 
 /*
  *	_vm_map_unlock_and_wait:
  *
  *	Atomically releases the lock on the specified map and puts the calling
  *	thread to sleep.  The calling thread will remain asleep until either
  *	vm_map_wakeup() is performed on the map or the specified timeout is
  *	exceeded.
  *
  *	WARNING!  This function does not perform deferred deallocations of
 *	objects and map entries.  Therefore, the calling thread is expected to
  *	reacquire the map lock after reawakening and later perform an ordinary
  *	unlock operation, such as vm_map_unlock(), before completing its
  *	operation on the map.
  */
 int
 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
 {
 	int rv;
 
 	/*
 	 * Take map_sleep_mtx before dropping the map lock; msleep()
 	 * is passed PDROP, so the mutex is not reacquired on return.
 	 */
 	mtx_lock(&map_sleep_mtx);
 	if (!map->system_map)
 		sx_xunlock_(&map->lock, file, line);
 	else
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	rv = msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", timo);
 	return (rv);
 }
 
 /*
  *	vm_map_wakeup:
  *
  *	Awaken any threads that have slept on the map using
  *	vm_map_unlock_and_wait().
  */
 void
 vm_map_wakeup(vm_map_t map)
 {
 
 	/*
 	 * Acquire and release map_sleep_mtx to prevent a wakeup()
 	 * from being performed (and lost) between the map unlock
 	 * and the msleep() in _vm_map_unlock_and_wait().
 	 */
 	mtx_lock(&map_sleep_mtx);
 	mtx_unlock(&map_sleep_mtx);
 	/* &map->root is the channel _vm_map_unlock_and_wait() sleeps on. */
 	wakeup(&map->root);
 }
 
 /*
  * Mark the map busy by bumping its busy count.  The map lock must be
  * held; the count is dropped again by vm_map_unbusy().
  */
 void
 vm_map_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	map->busy += 1;
 }
 
 /*
  * Drop one busy reference on the map.  When the count reaches zero
  * and a waiter has set MAP_BUSY_WAKEUP, clear the flag and wake the
  * threads sleeping on &map->busy.  The map lock must be held.
  */
 void
 vm_map_unbusy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
 	map->busy--;
 	if (map->busy != 0)
 		return;
 	if ((map->flags & MAP_BUSY_WAKEUP) != 0) {
 		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
 		wakeup(&map->busy);
 	}
 }
 
 /*
  * Sleep until the map's busy count drops to zero, then advance the
  * map timestamp.  The map lock must be held; it is the interlock
  * passed to the sleep primitives.
  */
 void
 vm_map_wait_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	while (map->busy != 0) {
 		/* Ask vm_map_unbusy() to issue a wakeup on &map->busy. */
 		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
 		if (!map->system_map)
 			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
 		else
 			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
 	}
 	map->timestamp++;
 }
 
 /*
  * Return the resident page count reported by the pmap that belongs
  * to the given vmspace.
  */
 long
 vmspace_resident_count(struct vmspace *vmspace)
 {
 
 	return (pmap_resident_count(vmspace_pmap(vmspace)));
 }
 
 /*
  *	vm_map_create:
  *
  *	Creates and returns a new empty VM map with
  *	the given physical map structure, and having
  *	the given lower and upper address bounds.
  */
 vm_map_t
 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 	vm_map_t map;
 
 	/* M_WAITOK: the allocation is permitted to sleep. */
 	map = uma_zalloc(mapzone, M_WAITOK);
 	CTR1(KTR_VM, "vm_map_create: %p", map);
 	_vm_map_init(map, pmap, min, max);
 	return (map);
 }
 
 /*
  * Initialize an existing vm_map structure
  * such as that in the vmspace structure.
  */
 static void
 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	/* Address range and backing physical map. */
 	map->pmap = pmap;
 	map->min_offset = min;
 	map->max_offset = max;
 
 	/* Empty entry list: the header links to itself; no tree root. */
 	map->header.next = &map->header;
 	map->header.prev = &map->header;
 	map->root = NULL;
 
 	/* State flags and counters all start out cleared. */
 	map->needs_wakeup = FALSE;
 	map->system_map = 0;
 	map->flags = 0;
 	map->timestamp = 0;
 	map->busy = 0;
 }
 
 void
 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	/* Reset the map's fields, then initialize both of its locks. */
 	_vm_map_init(map, pmap, min, max);
 	/* Mutex used by the locking routines when map->system_map is set. */
 	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
 	/* Shared/exclusive lock used by the locking routines otherwise. */
 	sx_init(&map->lock, "user map");
 }
 
 /*
  *	vm_map_entry_dispose:	[ internal use only ]
  *
  *	Inverse of vm_map_entry_create.
  */
 static void
 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 
 	/* Free to the zone that matches the map type. */
 	if (map->system_map)
 		uma_zfree(kmapentzone, entry);
 	else
 		uma_zfree(mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_create:	[ internal use only ]
  *
  *	Allocates a VM map entry for insertion.
  *	No entry fields are filled in.
  */
 static vm_map_entry_t
 vm_map_entry_create(vm_map_t map)
 {
 	vm_map_entry_t ent;
 
 	/*
 	 * System maps allocate from kmapentzone with M_NOWAIT; other
 	 * maps allocate from mapentzone with M_WAITOK.
 	 */
 	ent = map->system_map ? uma_zalloc(kmapentzone, M_NOWAIT) :
 	    uma_zalloc(mapentzone, M_WAITOK);
 	if (ent == NULL)
 		panic("vm_map_entry_create: kernel resources exhausted");
 	return (ent);
 }
 
 /*
  *	vm_map_entry_set_behavior:
  *
  *	Set the expected access behavior, either normal, random, or
  *	sequential.
  */
 static inline void
 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
 {
 
 	/* Replace only the behavior bits; every other flag is kept. */
 	entry->eflags = (behavior & MAP_ENTRY_BEHAV_MASK) |
 	    (entry->eflags & ~MAP_ENTRY_BEHAV_MASK);
 }
 
 /*
  *	vm_map_entry_set_max_free:
  *
  *	Set the max_free field in a vm_map_entry.
  */
 static inline void
 vm_map_entry_set_max_free(vm_map_entry_t entry)
 {
 	vm_map_entry_t child;
 
 	/* Start from the entry's own gap, then fold in each child. */
 	entry->max_free = entry->adj_free;
 
 	child = entry->left;
 	if (child != NULL && entry->max_free < child->max_free)
 		entry->max_free = child->max_free;
 
 	child = entry->right;
 	if (child != NULL && entry->max_free < child->max_free)
 		entry->max_free = child->max_free;
 }
 
 /*
  *	vm_map_entry_splay:
  *
  *	The Sleator and Tarjan top-down splay algorithm with the
  *	following variation.  Max_free must be computed bottom-up, so
  *	on the downward pass, maintain the left and right spines in
  *	reverse order.  Then, make a second pass up each side to fix
  *	the pointers and compute max_free.  The time bound is O(log n)
  *	amortized.
  *
  *	The new root is the vm_map_entry containing "addr", or else an
  *	adjacent entry (lower or higher) if addr is not in the tree.
  *
  *	The map must be locked, and leaves it so.
  *
  *	Returns: the new root.
  */
 static vm_map_entry_t
 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
 {
 	/*
 	 * llist/rlist: the left and right spines, kept in reverse order
 	 *              during Pass One.
 	 * ltree/rtree: the subtrees rebuilt from those spines in Pass Two.
 	 * y:           scratch pointer (child on the way down, saved link
 	 *              on the way up).
 	 */
 	vm_map_entry_t llist, rlist;
 	vm_map_entry_t ltree, rtree;
 	vm_map_entry_t y;
 
 	/* Special case of empty tree. */
 	if (root == NULL)
 		return (root);
 
 	/*
 	 * Pass One: Splay down the tree until we find addr or a NULL
 	 * pointer where addr would go.  llist and rlist are the two
 	 * sides in reverse order (bottom-up), with llist linked by
 	 * the right pointer and rlist linked by the left pointer in
 	 * the vm_map_entry.  Wait until Pass Two to set max_free on
 	 * the two spines.
 	 */
 	llist = NULL;
 	rlist = NULL;
 	for (;;) {
 		/* root is never NULL in here. */
 		if (addr < root->start) {
 			/* addr lies below root: descend to the left. */
 			y = root->left;
 			if (y == NULL)
 				break;
 			if (addr < y->start && y->left != NULL) {
 				/* Rotate right and put y on rlist. */
 				root->left = y->right;
 				y->right = root;
 				vm_map_entry_set_max_free(root);
 				root = y->left;
 				y->left = rlist;
 				rlist = y;
 			} else {
 				/* Put root on rlist. */
 				root->left = rlist;
 				rlist = root;
 				root = y;
 			}
 		} else if (addr >= root->end) {
 			/* addr lies at or above root's end: descend right. */
 			y = root->right;
 			if (y == NULL)
 				break;
 			if (addr >= y->end && y->right != NULL) {
 				/* Rotate left and put y on llist. */
 				root->right = y->left;
 				y->left = root;
 				vm_map_entry_set_max_free(root);
 				root = y->right;
 				y->right = llist;
 				llist = y;
 			} else {
 				/* Put root on llist. */
 				root->right = llist;
 				llist = root;
 				root = y;
 			}
 		} else
 			/* root->start <= addr < root->end: found it. */
 			break;
 	}
 
 	/*
 	 * Pass Two: Walk back up the two spines, flip the pointers
 	 * and set max_free.  The subtrees of the root go at the
 	 * bottom of llist and rlist.
 	 */
 	ltree = root->left;
 	while (llist != NULL) {
 		y = llist->right;
 		llist->right = ltree;
 		vm_map_entry_set_max_free(llist);
 		ltree = llist;
 		llist = y;
 	}
 	rtree = root->right;
 	while (rlist != NULL) {
 		y = rlist->left;
 		rlist->left = rtree;
 		vm_map_entry_set_max_free(rlist);
 		rtree = rlist;
 		rlist = y;
 	}
 
 	/*
 	 * Final assembly: add ltree and rtree as subtrees of root.
 	 */
 	root->left = ltree;
 	root->right = rtree;
 	vm_map_entry_set_max_free(root);
 
 	return (root);
 }
 
 /*
  *	vm_map_entry_{un,}link:
  *
  *	Insert/remove entries from maps.
  */
 static void
 vm_map_entry_link(vm_map_t map,
 		  vm_map_entry_t after_where,
 		  vm_map_entry_t entry)
 {
 
 	CTR4(KTR_VM,
 	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
 	    map->nentries, entry, after_where);
 	VM_MAP_ASSERT_LOCKED(map);
 	map->nentries++;
 	/* Insert "entry" into the doubly-linked list right after after_where. */
 	entry->prev = after_where;
 	entry->next = after_where->next;
 	entry->next->prev = entry;
 	after_where->next = entry;
 
 	if (after_where != &map->header) {
 		/*
 		 * Splay after_where to the top of the tree.  The return
 		 * value is not stored because "entry" is installed as
 		 * map->root below.
 		 */
 		if (after_where != map->root)
 			vm_map_entry_splay(after_where->start, map->root);
 		/* "entry" takes over the right subtree; after_where goes left. */
 		entry->right = after_where->right;
 		entry->left = after_where;
 		after_where->right = NULL;
 		after_where->adj_free = entry->start - after_where->end;
 		vm_map_entry_set_max_free(after_where);
 	} else {
 		/* Inserting after the header: the old tree hangs to the right. */
 		entry->right = map->root;
 		entry->left = NULL;
 	}
 	/* Free space after "entry" runs to the next entry or to max_offset. */
 	entry->adj_free = (entry->next == &map->header ? map->max_offset :
 	    entry->next->start) - entry->end;
 	vm_map_entry_set_max_free(entry);
 	map->root = entry;
 }
 
 static void
 vm_map_entry_unlink(vm_map_t map,
 		    vm_map_entry_t entry)
 {
 	vm_map_entry_t next, prev, root;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	/*
 	 * Splay "entry" to the top of the tree.  The return value is not
 	 * stored because map->root is replaced below.
 	 */
 	if (entry != map->root)
 		vm_map_entry_splay(entry->start, map->root);
 	if (entry->left == NULL)
 		root = entry->right;
 	else {
 		/*
 		 * Splay the left subtree on entry->start to pick its new
 		 * root, then hang entry's right subtree off that root.
 		 */
 		root = vm_map_entry_splay(entry->start, entry->left);
 		root->right = entry->right;
 		root->adj_free = (entry->next == &map->header ? map->max_offset :
 		    entry->next->start) - root->end;
 		vm_map_entry_set_max_free(root);
 	}
 	map->root = root;
 
 	/* Remove "entry" from the doubly-linked list. */
 	prev = entry->prev;
 	next = entry->next;
 	next->prev = prev;
 	prev->next = next;
 	map->nentries--;
 	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
 	    map->nentries, entry);
 }
 
 /*
  *	vm_map_entry_resize_free:
  *
  *	Recompute the amount of free space following a vm_map_entry
  *	and propagate that value up the tree.  Call this function after
  *	resizing a map entry in-place, that is, without a call to
  *	vm_map_entry_link() or _unlink().
  *
  *	The map must be locked, and leaves it so.
  */
 static void
 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
 {
 
 	/*
 	 * There are no parent pointers in the splay tree, so max_free
 	 * is propagated by first moving the entry to the root and
 	 * updating it there.
 	 */
 	if (entry != map->root)
 		map->root = vm_map_entry_splay(entry->start, map->root);
 
 	/* The gap runs up to the next entry, or to the end of the map. */
 	if (entry->next == &map->header)
 		entry->adj_free = map->max_offset - entry->end;
 	else
 		entry->adj_free = entry->next->start - entry->end;
 	vm_map_entry_set_max_free(entry);
 }
 
 /*
  *	vm_map_lookup_entry:	[ internal use only ]
  *
  *	Finds the map entry containing (or
  *	immediately preceding) the specified address
  *	in the given map; the entry is returned
  *	in the "entry" parameter.  The boolean
  *	result indicates whether the address is
  *	actually contained in the map.
  */
 boolean_t
 vm_map_lookup_entry(
 	vm_map_t map,
 	vm_offset_t address,
 	vm_map_entry_t *entry)	/* OUT */
 {
 	vm_map_entry_t cur;
 	boolean_t locked;
 
 	/*
 	 * If the map is empty, then the map entry immediately preceding
 	 * "address" is the map's header.
 	 */
 	cur = map->root;
 	if (cur == NULL)
 		*entry = &map->header;
 	else if (address >= cur->start && cur->end > address) {
 		/* Fast path: the current root already contains "address". */
 		*entry = cur;
 		return (TRUE);
 	} else if ((locked = vm_map_locked(map)) ||
 	    sx_try_upgrade(&map->lock)) {
 		/*
 		 * Either the caller already holds the map exclusively
 		 * ("locked" is non-zero) or the lock was just upgraded.
 		 *
 		 * Splay requires a write lock on the map.  However, it only
 		 * restructures the binary search tree; it does not otherwise
 		 * change the map.  Thus, the map's timestamp need not change
 		 * on a temporary upgrade.
 		 */
 		map->root = cur = vm_map_entry_splay(address, cur);
 		/* Undo the temporary upgrade taken in the condition above. */
 		if (!locked)
 			sx_downgrade(&map->lock);
 
 		/*
 		 * If "address" is contained within a map entry, the new root
 		 * is that map entry.  Otherwise, the new root is a map entry
 		 * immediately before or after "address".
 		 */
 		if (address >= cur->start) {
 			*entry = cur;
 			if (cur->end > address)
 				return (TRUE);
 		} else
 			*entry = cur->prev;
 	} else
 		/*
 		 * Since the map is only locked for read access, perform a
 		 * standard binary search tree lookup for "address".
 		 */
 		for (;;) {
 			if (address < cur->start) {
 				if (cur->left == NULL) {
 					/* No smaller entry below: predecessor is cur->prev. */
 					*entry = cur->prev;
 					break;
 				}
 				cur = cur->left;
 			} else if (cur->end > address) {
 				*entry = cur;
 				return (TRUE);
 			} else {
 				if (cur->right == NULL) {
 					/* "address" is past cur with nothing to its right. */
 					*entry = cur;
 					break;
 				}
 				cur = cur->right;
 			}
 		}
 	return (FALSE);
 }
 
 /*
  *	vm_map_insert:
  *
  *	Inserts the given whole VM object into the target
  *	map at the specified address range.  The object's
  *	size should match that of the address range.
  *
  *	Requires that the map be locked, and leaves it so.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	      vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
 	      int cow)
 {
 	vm_map_entry_t new_entry;
 	vm_map_entry_t prev_entry;
 	vm_map_entry_t temp_entry;
 	vm_eflags_t protoeflags;
 	struct ucred *cred;
 	vm_inherit_t inheritance;
 	boolean_t charge_prev_obj;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
 	if ((start < map->min_offset) || (end > map->max_offset) ||
 	    (start >= end))
 		return (KERN_INVALID_ADDRESS);
 
 	/*
 	 * Find the entry prior to the proposed starting address; if it's part
 	 * of an existing entry, this range is bogus.
 	 */
 	if (vm_map_lookup_entry(map, start, &temp_entry))
 		return (KERN_NO_SPACE);
 
 	prev_entry = temp_entry;
 
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
 	if ((prev_entry->next != &map->header) &&
 	    (prev_entry->next->start < end))
 		return (KERN_NO_SPACE);
 
 	/*
 	 * Translate the caller's "cow" flags into the entry flags and
 	 * inheritance that the new mapping will carry.
 	 */
 	protoeflags = 0;
 	charge_prev_obj = FALSE;
 
 	if (cow & MAP_COPY_ON_WRITE)
 		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
 
 	if (cow & MAP_NOFAULT) {
 		protoeflags |= MAP_ENTRY_NOFAULT;
 
 		KASSERT(object == NULL,
 			("vm_map_insert: paradoxical MAP_NOFAULT request"));
 	}
 	if (cow & MAP_DISABLE_SYNCER)
 		protoeflags |= MAP_ENTRY_NOSYNC;
 	if (cow & MAP_DISABLE_COREDUMP)
 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
 	if (cow & MAP_VN_WRITECOUNT)
 		protoeflags |= MAP_ENTRY_VN_WRITECNT;
 	if (cow & MAP_INHERIT_SHARE)
 		inheritance = VM_INHERIT_SHARE;
 	else
 		inheritance = VM_INHERIT_DEFAULT;
 
 	cred = NULL;
 	KASSERT((object != kmem_object && object != kernel_object) ||
 	    ((object == kmem_object || object == kernel_object) &&
 		!(protoeflags & MAP_ENTRY_NEEDS_COPY)),
 	    ("kmem or kernel object and cow"));
 	/* Requests flagged as uncharged or no-fault skip swap accounting. */
 	if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT))
 		goto charged;
 	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
 	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
 		/* Reserve swap here unless the caller says it already did. */
 		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
 			return (KERN_RESOURCE_SHORTAGE);
 		KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) ||
 		    object->cred == NULL,
 		    ("OVERCOMMIT: vm_map_insert o %p", object));
 		/* Hold the current thread's credential for the charge. */
 		cred = curthread->td_ucred;
 		crhold(cred);
 		if (object == NULL && !(protoeflags & MAP_ENTRY_NEEDS_COPY))
 			charge_prev_obj = TRUE;
 	}
 
 charged:
 	/* Expand the kernel pmap, if necessary. */
 	if (map == kernel_map && end > kernel_vm_end)
 		pmap_growkernel(end);
 	if (object != NULL) {
 		/*
 		 * OBJ_ONEMAPPING must be cleared unless this mapping
 		 * is trivially proven to be the only mapping for any
 		 * of the object's pages.  (Object granularity
 		 * reference counting is insufficient to recognize
 		 * aliases with precision.)
 		 */
 		VM_OBJECT_LOCK(object);
 		if (object->ref_count > 1 || object->shadow_count != 0)
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
 		VM_OBJECT_UNLOCK(object);
 	}
 	/*
 	 * No object was supplied: try to extend the previous entry's
 	 * object to cover the new range when the entries are adjacent
 	 * and compatible.
 	 */
 	else if ((prev_entry != &map->header) &&
 		 (prev_entry->eflags == protoeflags) &&
 		 (prev_entry->end == start) &&
 		 (prev_entry->wired_count == 0) &&
 		 (prev_entry->cred == cred ||
 		  (prev_entry->object.vm_object != NULL &&
 		   (prev_entry->object.vm_object->cred == cred))) &&
 		   vm_object_coalesce(prev_entry->object.vm_object,
 		       prev_entry->offset,
 		       (vm_size_t)(prev_entry->end - prev_entry->start),
 		       (vm_size_t)(end - prev_entry->end), charge_prev_obj)) {
 		/*
 		 * We were able to extend the object.  Determine if we
 		 * can extend the previous map entry to include the
 		 * new range as well.
 		 */
 		if ((prev_entry->inheritance == inheritance) &&
 		    (prev_entry->protection == prot) &&
 		    (prev_entry->max_protection == max)) {
 			map->size += (end - prev_entry->end);
 			prev_entry->end = end;
 			vm_map_entry_resize_free(map, prev_entry);
 			vm_map_simplify_entry(map, prev_entry);
 			/* No new entry was created, so drop the held cred. */
 			if (cred != NULL)
 				crfree(cred);
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * If we can extend the object but cannot extend the
 		 * map entry, we have to create a new map entry.  We
 		 * must bump the ref count on the extended object to
 		 * account for it.  object may be NULL.
 		 */
 		object = prev_entry->object.vm_object;
 		offset = prev_entry->offset +
 			(prev_entry->end - prev_entry->start);
 		vm_object_reference(object);
 		if (cred != NULL && object != NULL && object->cred != NULL &&
 		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			/* Object already accounts for this uid. */
 			crfree(cred);
 			cred = NULL;
 		}
 	}
 
 	/*
 	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
 	 * in things like the buffer map where we manage kva but do not manage
 	 * backing objects.
 	 */
 
 	/*
 	 * Create a new entry
 	 */
 	new_entry = vm_map_entry_create(map);
 	new_entry->start = start;
 	new_entry->end = end;
 	new_entry->cred = NULL;
 
 	new_entry->eflags = protoeflags;
 	new_entry->object.vm_object = object;
 	new_entry->offset = offset;
 	new_entry->avail_ssize = 0;
 
 	new_entry->inheritance = inheritance;
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
 	new_entry->next_read = OFF_TO_IDX(offset);
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
 	    ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry));
 	new_entry->cred = cred;
 
 	/*
 	 * Insert the new entry into the list
 	 */
 	vm_map_entry_link(map, prev_entry, new_entry);
 	map->size += new_entry->end - new_entry->start;
 
 	/*
 	 * It may be possible to merge the new entry with the next and/or
 	 * previous entries.  However, due to MAP_STACK_* being a hack, a
 	 * panic can result from merging such entries.
 	 */
 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)
 		vm_map_simplify_entry(map, new_entry);
 
 	/* Optionally pre-populate the pmap for the new range. */
 	if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
 		vm_map_pmap_enter(map, start, prot,
 				    object, OFF_TO_IDX(offset), end - start,
 				    cow & MAP_PREFAULT_PARTIAL);
 	}
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_findspace:
  *
  *	Find the first fit (lowest VM address) for "length" free bytes
  *	beginning at address >= start in the given map.
  *
  *	In a vm_map_entry, "adj_free" is the amount of free space
  *	adjacent (higher address) to this entry, and "max_free" is the
  *	maximum amount of contiguous free space in its subtree.  This
  *	allows finding a free region in one path down the tree, so
  *	O(log n) amortized with splay trees.
  *
  *	The map must be locked, and leaves it so.
  *
  *	Returns: 0 on success, and starting address in *addr,
  *		 1 if insufficient space.
  */
 int
 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
     vm_offset_t *addr)	/* OUT */
 {
 	vm_map_entry_t node;
 	vm_offset_t base;
 
 	/*
 	 * Clamp the request to the map's lower bound, and reject it if
 	 * it would run past the upper bound or wrap the address space.
 	 */
 	if (start < map->min_offset)
 		start = map->min_offset;
 	if (start + length > map->max_offset || start + length < start)
 		return (1);
 
 	/* With no entries at all, the whole range is available. */
 	if (map->root == NULL) {
 		*addr = start;
 		return (0);
 	}
 
 	/*
 	 * Splay on start.  If the request fits entirely before the new
 	 * root, the gap in front of the root satisfies it.
 	 */
 	map->root = vm_map_entry_splay(start, map->root);
 	if (start + length <= map->root->start) {
 		*addr = start;
 		return (0);
 	}
 
 	/*
 	 * The root is the last node whose gap might begin before
 	 * start; this is also the last comparison where address wrap
 	 * could matter.  Try the gap adjacent to the root, starting at
 	 * the larger of start and the root's end.
 	 */
 	base = map->root->end;
 	if (start > base)
 		base = start;
 	if (length <= map->root->end + map->root->adj_free - base) {
 		*addr = base;
 		return (0);
 	}
 
 	/* max_free tells immediately whether the right subtree can help. */
 	node = map->root->right;
 	if (node == NULL || length > node->max_free)
 		return (1);
 
 	/*
 	 * First-fit walk of the right subtree: prefer the left child,
 	 * then the node's own gap, then the right child.  After the
 	 * splay every region here lies above start.
 	 */
 	for (; node != NULL; ) {
 		if (node->left != NULL && node->left->max_free >= length) {
 			node = node->left;
 			continue;
 		}
 		if (node->adj_free >= length) {
 			*addr = node->end;
 			return (0);
 		}
 		node = node->right;
 	}
 
 	/* Can't get here, so panic if we do. */
 	panic("vm_map_findspace: max_free corrupt");
 }
 
 /*
  * Map "object" at exactly [start, start + length): lock the map,
  * delete whatever currently occupies the range, insert the new
  * mapping, and return vm_map_insert()'s result.
  */
 int
 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_size_t length, vm_prot_t prot,
     vm_prot_t max, int cow)
 {
 	vm_offset_t end;
 	int rv;
 
 	end = start + length;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	/* The result of the delete is deliberately ignored. */
 	(void)vm_map_delete(map, start, end);
 	rv = vm_map_insert(map, object, offset, start, end, prot, max, cow);
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_find finds an unallocated region in the target address
  *	map with the given length.  The search is defined to be
  *	first-fit from the specified address; the region found is
  *	returned in the same parameter.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	    vm_offset_t *addr,	/* IN/OUT */
 	    vm_size_t length, int find_space, vm_prot_t prot,
 	    vm_prot_t max, int cow)
 {
 	vm_offset_t start;
 	int result;
 
 	start = *addr;
 	vm_map_lock(map);
 	do {
 		/* VMFS_NO_SPACE: use the caller's address as given. */
 		if (find_space != VMFS_NO_SPACE) {
 			if (vm_map_findspace(map, start, length, addr)) {
 				vm_map_unlock(map);
 				return (KERN_NO_SPACE);
 			}
 			/* Let the pmap layer adjust *addr for alignment. */
 			switch (find_space) {
 			case VMFS_ALIGNED_SPACE:
 				pmap_align_superpage(object, offset, addr,
 				    length);
 				break;
 #ifdef VMFS_TLB_ALIGNED_SPACE
 			case VMFS_TLB_ALIGNED_SPACE:
 				pmap_align_tlb(addr);
 				break;
 #endif
 			default:
 				break;
 			}
 
 			start = *addr;
 		}
 		result = vm_map_insert(map, object, offset, start, start +
 		    length, prot, max, cow);
 		/*
 		 * Retry only for the aligned modes when the insert
 		 * reports KERN_NO_SPACE; the next search begins at the
 		 * adjusted "start".
 		 */
 	} while (result == KERN_NO_SPACE && (find_space == VMFS_ALIGNED_SPACE
 #ifdef VMFS_TLB_ALIGNED_SPACE
 	    || find_space == VMFS_TLB_ALIGNED_SPACE
 #endif
 	    ));
 	vm_map_unlock(map);
 	return (result);
 }
 
 /*
  *	vm_map_simplify_entry:
  *
  *	Simplify the given map entry by merging with either neighbor.  This
  *	routine also has the ability to merge with both neighbors.
  *
  *	The map must be locked.
  *
 *	This routine guarantees that the passed entry remains valid (though
  *	possibly extended).  When merging, this routine may delete one or
  *	both neighbors.
  */
 void
 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_map_entry_t next, prev;
 	vm_size_t prevsize, esize;
 
 	/* Entries in transition and submap entries are never merged. */
 	if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
 		return;
 
 	/*
 	 * Try to absorb the previous entry: it must be adjacent, refer
 	 * to the same object at a contiguous offset, and match in every
 	 * attribute compared below.
 	 */
 	prev = entry->prev;
 	if (prev != &map->header) {
 		prevsize = prev->end - prev->start;
 		if ( (prev->end == entry->start) &&
 		     (prev->object.vm_object == entry->object.vm_object) &&
 		     (!prev->object.vm_object ||
 			(prev->offset + prevsize == entry->offset)) &&
 		     (prev->eflags == entry->eflags) &&
 		     (prev->protection == entry->protection) &&
 		     (prev->max_protection == entry->max_protection) &&
 		     (prev->inheritance == entry->inheritance) &&
 		     (prev->wired_count == entry->wired_count) &&
 		     (prev->cred == entry->cred)) {
 			vm_map_entry_unlink(map, prev);
 			/* Grow "entry" downward to cover prev's range. */
 			entry->start = prev->start;
 			entry->offset = prev->offset;
 			if (entry->prev != &map->header)
 				vm_map_entry_resize_free(map, entry->prev);
 
 			/*
 			 * If the backing object is a vnode object,
 			 * vm_object_deallocate() calls vrele().
 			 * However, vrele() does not lock the vnode
 			 * because the vnode has additional
 			 * references.  Thus, the map lock can be kept
 			 * without causing a lock-order reversal with
 			 * the vnode lock.
 			 *
 			 * Since we count the number of virtual page
 			 * mappings in object->un_pager.vnp.writemappings,
 			 * the writemappings value should not be adjusted
 			 * when the entry is disposed of.
 			 */
 			if (prev->object.vm_object)
 				vm_object_deallocate(prev->object.vm_object);
 			if (prev->cred != NULL)
 				crfree(prev->cred);
 			vm_map_entry_dispose(map, prev);
 		}
 	}
 
 	/* Same test in the other direction: try to absorb the next entry. */
 	next = entry->next;
 	if (next != &map->header) {
 		esize = entry->end - entry->start;
 		if ((entry->end == next->start) &&
 		    (next->object.vm_object == entry->object.vm_object) &&
 		     (!entry->object.vm_object ||
 			(entry->offset + esize == next->offset)) &&
 		    (next->eflags == entry->eflags) &&
 		    (next->protection == entry->protection) &&
 		    (next->max_protection == entry->max_protection) &&
 		    (next->inheritance == entry->inheritance) &&
 		    (next->wired_count == entry->wired_count) &&
 		    (next->cred == entry->cred)) {
 			vm_map_entry_unlink(map, next);
 			/* Grow "entry" upward to cover next's range. */
 			entry->end = next->end;
 			vm_map_entry_resize_free(map, entry);
 
 			/*
 			 * See comment above.
 			 */
 			if (next->object.vm_object)
 				vm_object_deallocate(next->object.vm_object);
 			if (next->cred != NULL)
 				crfree(next->cred);
 			vm_map_entry_dispose(map, next);
 		}
 	}
 }
 /*
  *	vm_map_clip_start:	[ internal use only ]
  *
  *	Asserts that the given entry begins at or after
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 /*
  * Every macro argument is parenthesized (as in vm_map_clip_end) so
  * that an operator expression passed as an argument is evaluated as
  * a unit.  Arguments are evaluated more than once; avoid side effects.
  */
 #define vm_map_clip_start(map, entry, startaddr) \
 { \
 	if ((startaddr) > (entry)->start) \
 		_vm_map_clip_start((map), (entry), (startaddr)); \
 }
 
 /*
  *	This routine is called only when it is known that
  *	the entry must be split.
  */
 static void
 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
 {
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	/*
 	 * Split off the front portion -- note that we must insert the new
 	 * entry BEFORE this one, so that this entry has the specified
 	 * starting address.
 	 */
 	vm_map_simplify_entry(map, entry);
 
 	/*
 	 * If there is no object backing this entry, we might as well create
 	 * one now.  If we defer it, an object can get created after the map
 	 * is clipped, and individual objects will be created for the split-up
 	 * map.  This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
 		entry->object.vm_object = object;
 		entry->offset = 0;
 		/* Move the entry's charge, if any, onto the new object. */
 		if (entry->cred != NULL) {
 			object->cred = entry->cred;
 			object->charge = entry->end - entry->start;
 			entry->cred = NULL;
 		}
 	} else if (entry->object.vm_object != NULL &&
 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 		   entry->cred != NULL) {
 		/*
 		 * The entry has an object, does not need a copy, and holds
 		 * a cred: transfer the cred and charge to the object.
 		 */
 		VM_OBJECT_LOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
 		VM_OBJECT_UNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 
 	/* The new entry starts out as a copy of the one being split. */
 	new_entry = vm_map_entry_create(map);
 	*new_entry = *entry;
 
 	/* new_entry keeps [old start, start); entry becomes [start, end). */
 	new_entry->end = start;
 	entry->offset += (start - entry->start);
 	entry->start = start;
 	/* Both entries share the same cred pointer after the copy. */
 	if (new_entry->cred != NULL)
 		crhold(entry->cred);
 
 	vm_map_entry_link(map, entry->prev, new_entry);
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		/* The copied entry needs its own reference on the object. */
 		vm_object_reference(new_entry->object.vm_object);
 		/*
 		 * The object->un_pager.vnp.writemappings for the
 		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
 		 * kept as is here.  The virtual pages are
 		 * re-distributed among the clipped entries, so the sum is
 		 * left the same.
 		 */
 	}
 }
 
 /*
  *	vm_map_clip_end:	[ internal use only ]
  *
  *	Asserts that the given entry ends at or before
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 /*
  * "entry" is parenthesized before the member access so that an
  * expression such as "&e" can be passed safely.  Arguments are
  * evaluated more than once; avoid side effects.
  */
 #define vm_map_clip_end(map, entry, endaddr) \
 { \
 	if ((endaddr) < (entry)->end) \
 		_vm_map_clip_end((map), (entry), (endaddr)); \
 }
 
 /*
  *	This routine is called only when it is known that
  *	the entry must be split.
  */
 static void
 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
 {
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	/*
 	 * If there is no object backing this entry, we might as well create
 	 * one now.  If we defer it, an object can get created after the map
 	 * is clipped, and individual objects will be created for the split-up
 	 * map.  This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
 		entry->object.vm_object = object;
 		entry->offset = 0;
 		/* Move the entry's charge, if any, onto the new object. */
 		if (entry->cred != NULL) {
 			object->cred = entry->cred;
 			object->charge = entry->end - entry->start;
 			entry->cred = NULL;
 		}
 	} else if (entry->object.vm_object != NULL &&
 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 		   entry->cred != NULL) {
 		/*
 		 * The entry has an object, does not need a copy, and holds
 		 * a cred: transfer the cred and charge to the object.
 		 */
 		VM_OBJECT_LOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
 		VM_OBJECT_UNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 
 	/*
 	 * Create a new entry and insert it AFTER the specified entry
 	 */
 	new_entry = vm_map_entry_create(map);
 	*new_entry = *entry;
 
 	/* entry keeps [start, end); new_entry becomes [end, old end). */
 	new_entry->start = entry->end = end;
 	new_entry->offset += (end - entry->start);
 	/* Both entries share the same cred pointer after the copy. */
 	if (new_entry->cred != NULL)
 		crhold(entry->cred);
 
 	vm_map_entry_link(map, entry, new_entry);
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		/* The copied entry needs its own reference on the object. */
 		vm_object_reference(new_entry->object.vm_object);
 	}
 }
 
 /*
  *	vm_map_submap:		[ kernel use only ]
  *
  *	Mark the given range as handled by a subordinate map.
  *
  *	This range must have been created with vm_map_find,
  *	and no other operations may have been performed on this
  *	range prior to calling vm_map_submap.
  *
  *	Only a limited number of operations can be performed
  *	within this range after calling vm_map_submap:
  *		vm_fault
  *	[Don't try vm_map_copy!]
  *
  *	To remove a submapping, one must first remove the
  *	range from the superior map, and then destroy the
  *	submap (if desired).  [Better yet, don't try it.]
  */
 int
 vm_map_submap(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	vm_map_t submap)
 {
 	vm_map_entry_t ent;
 	int rv;
 
 	/* Assume failure until the range is proven to be eligible. */
 	rv = KERN_INVALID_ARGUMENT;
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	/*
 	 * Find the entry for "start".  When the address is mapped, clip the
 	 * entry so that it begins there; otherwise continue with the entry
 	 * following the one returned by the lookup.
 	 */
 	if (!vm_map_lookup_entry(map, start, &ent))
 		ent = ent->next;
 	else {
 		vm_map_clip_start(map, ent, start);
 	}
 	vm_map_clip_end(map, ent, end);
 
 	/*
 	 * Only an entry that covers exactly [start, end), is not
 	 * copy-on-write and has no object yet may be turned into a submap
 	 * entry.
 	 */
 	if (ent->start == start && ent->end == end &&
 	    (ent->eflags & MAP_ENTRY_COW) == 0 &&
 	    ent->object.vm_object == NULL) {
 		ent->object.sub_map = submap;
 		ent->eflags |= MAP_ENTRY_IS_SUB_MAP;
 		rv = KERN_SUCCESS;
 	}
 
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  * The maximum number of pages that vm_map_pmap_enter() will map when it is
  * called with MAP_PREFAULT_PARTIAL set in its flags.
  */
 #define	MAX_INIT_PT	96
 
 /*
  *	vm_map_pmap_enter:
  *
  *	Preload read-only mappings for the specified object's resident pages
  *	into the target map.  If "flags" is MAP_PREFAULT_PARTIAL, then only
  *	the resident pages within the address range [addr, addr + ulmin(size,
  *	ptoa(MAX_INIT_PT))) are mapped.  Otherwise, all resident pages within
  *	the specified address range are mapped.  This eliminates many soft
  *	faults on process startup and immediately after an mmap(2).  Because
  *	these are speculative mappings, cached pages are not reactivated and
  *	mapped.
  */
 void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
 	vm_offset_t run_va;
 	vm_page_t pg, run_head;
 	vm_pindex_t npages, rel;
 
 	/* Nothing to do without an object or without read/execute access. */
 	if (object == NULL || (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0)
 		return;
 
 	VM_OBJECT_LOCK(object);
 
 	/* Device and SG objects are handed to the pmap layer wholesale. */
 	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
 		pmap_object_init_pt(map->pmap, addr, object, pindex, size);
 		VM_OBJECT_UNLOCK(object);
 		return;
 	}
 
 	/*
 	 * Work out how many pages to consider: the requested size, capped
 	 * at MAX_INIT_PT for partial prefaults, and never extending beyond
 	 * the end of the object.
 	 */
 	npages = atop(size);
 	if ((flags & MAP_PREFAULT_PARTIAL) != 0 && npages > MAX_INIT_PT)
 		npages = MAX_INIT_PT;
 	if (npages + pindex > object->size) {
 		if (object->size < pindex) {
 			VM_OBJECT_UNLOCK(object);
 			return;
 		}
 		npages = object->size - pindex;
 	}
 
 	/*
 	 * Walk the resident pages starting with the first one at or after
 	 * "pindex".  Consecutively visited fully valid pages are collected
 	 * into a run, described by its first page (run_head) and its
 	 * virtual address (run_va); each run is entered with a single
 	 * pmap_enter_object() call.
 	 */
 	run_va = 0;
 	run_head = NULL;
 	pg = vm_page_find_least(object, pindex);
 	while (pg != NULL) {
 		rel = pg->pindex - pindex;
 		if (rel >= npages)
 			break;
 
 		/*
 		 * don't allow an madvise to blow away our really
 		 * free pages allocating pv entries.
 		 */
 		if ((flags & MAP_PREFAULT_MADVISE) &&
 		    cnt.v_free_count < cnt.v_free_reserved) {
 			npages = rel;
 			break;
 		}
 
 		if (pg->valid != VM_PAGE_BITS_ALL) {
 			/* A page that is not fully valid ends the open run. */
 			if (run_head != NULL) {
 				pmap_enter_object(map->pmap, run_va,
 				    addr + ptoa(rel), run_head, prot);
 				run_head = NULL;
 			}
 		} else if (run_head == NULL) {
 			/* A fully valid page opens a new run. */
 			run_va = addr + ptoa(rel);
 			run_head = pg;
 		}
 
 		pg = TAILQ_NEXT(pg, listq);
 	}
 
 	/* Flush the run that was still open when the walk stopped. */
 	if (run_head != NULL)
 		pmap_enter_object(map->pmap, run_va, addr + ptoa(npages),
 		    run_head, prot);
 
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_map_protect:
  *
  *	Sets the protection of the specified address
  *	region in the target map.  If "set_max" is
  *	specified, the maximum protection is to be set;
  *	otherwise, only the current protection is affected.
  */
 int
 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_prot_t new_prot, boolean_t set_max)
 {
 	/*
 	 * Return values:
 	 *   KERN_INVALID_ARGUMENT    - a submap entry lies in the range
 	 *   KERN_PROTECTION_FAILURE  - new_prot exceeds an entry's
 	 *                              max_protection
 	 *   KERN_RESOURCE_SHORTAGE   - swap_reserve() failed
 	 *   KERN_SUCCESS             - otherwise
 	 *
 	 * The map is locked exclusively for the whole operation; every
 	 * return path below drops the lock first.
 	 */
 	vm_map_entry_t current, entry;
 	vm_object_t obj;
 	struct ucred *cred;
 	vm_prot_t old_prot;
 
 	vm_map_lock(map);
 
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	/*
 	 * Locate the first entry of the range.  If "start" is mapped, clip
 	 * that entry so it begins at "start"; otherwise begin with the
 	 * entry following the one returned by the lookup.
 	 */
 	if (vm_map_lookup_entry(map, start, &entry)) {
 		vm_map_clip_start(map, entry, start);
 	} else {
 		entry = entry->next;
 	}
 
 	/*
 	 * Make a first pass to check for protection violations.
 	 * Nothing other than the start clip above has been modified yet
 	 * when this pass fails.
 	 */
 	current = entry;
 	while ((current != &map->header) && (current->start < end)) {
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if ((new_prot & current->max_protection) != new_prot) {
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
 		current = current->next;
 	}
 
 
 	/*
 	 * Do an accounting pass for private read-only mappings that
 	 * now will do cow due to allowed write (e.g. debugger sets
 	 * breakpoint on text segment)
 	 */
 	for (current = entry; (current != &map->header) &&
 	     (current->start < end); current = current->next) {
 
 		/* The last entry of the range is clipped to end at "end". */
 		vm_map_clip_end(map, current, end);
 
 		/*
 		 * No new charge is needed when only the maximum protection
 		 * is being set, when write permission is not being newly
 		 * added, or when the entry is already charged.
 		 */
 		if (set_max ||
 		    ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
 		    ENTRY_CHARGED(current)) {
 			continue;
 		}
 
 		cred = curthread->td_ucred;
 		obj = current->object.vm_object;
 
 		/*
 		 * Without an object, or with a pending copy, the charge is
 		 * recorded on the entry itself and covers the entry's size.
 		 * NOTE(review): charges made by earlier iterations are not
 		 * undone on the failure return below.
 		 */
 		if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			if (!swap_reserve(current->end - current->start)) {
 				vm_map_unlock(map);
 				return (KERN_RESOURCE_SHORTAGE);
 			}
 			crhold(cred);
 			current->cred = cred;
 			continue;
 		}
 
 		/* Only OBJT_DEFAULT and OBJT_SWAP objects are charged. */
 		VM_OBJECT_LOCK(obj);
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
 			VM_OBJECT_UNLOCK(obj);
 			continue;
 		}
 
 		/*
 		 * Charge for the whole object allocation now, since
 		 * we cannot distinguish between non-charged and
 		 * charged clipped mapping of the same object later.
 		 */
 		KASSERT(obj->charge == 0,
 		    ("vm_map_protect: object %p overcharged\n", obj));
 		if (!swap_reserve(ptoa(obj->size))) {
 			VM_OBJECT_UNLOCK(obj);
 			vm_map_unlock(map);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 
 		crhold(cred);
 		obj->cred = cred;
 		obj->charge = ptoa(obj->size);
 		VM_OBJECT_UNLOCK(obj);
 	}
 
 	/*
 	 * Go back and fix up protections. [Note that clipping is not
 	 * necessary the second time.]
 	 */
 	current = entry;
 	while ((current != &map->header) && (current->start < end)) {
 		old_prot = current->protection;
 
 		/*
 		 * With set_max the maximum protection becomes new_prot and
 		 * the current protection is reduced to the part of the old
 		 * protection that new_prot still allows.
 		 */
 		if (set_max)
 			current->protection =
 			    (current->max_protection = new_prot) &
 			    old_prot;
 		else
 			current->protection = new_prot;
 
 		/*
 		 * An entry that is both copy-on-write and user wired, and
 		 * that has just gained write permission, is passed to
 		 * vm_fault_copy_entry() right away.
 		 */
 		if ((current->eflags & (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED))
 		     == (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED) &&
 		    (current->protection & VM_PROT_WRITE) != 0 &&
 		    (old_prot & VM_PROT_WRITE) == 0) {
 			vm_fault_copy_entry(map, map, current, current, NULL);
 		}
 
 		/*
 		 * When restricting access, update the physical map.  Worry
 		 * about copy-on-write here.
 		 */
 		if ((old_prot & ~current->protection) != 0) {
 		/* For COW entries, write access is masked out of the pmap. */
 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
 							VM_PROT_ALL)
 			pmap_protect(map->pmap, current->start,
 			    current->end,
 			    current->protection & MASK(current));
 #undef	MASK
 		}
 		vm_map_simplify_entry(map, current);
 		current = current->next;
 	}
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_madvise:
  *
  *	This routine traverses a process's map handling the madvise
  *	system call.  Advisories are classified as either those affecting
  *	the vm_map_entry structure, or those affecting the underlying
  *	objects.
  */
 int
 vm_map_madvise(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	int behav)
 {
 	vm_map_entry_t cur, first;
 	vm_pindex_t pg_lo, pg_hi;
 	vm_offset_t va;
 	int exclusive;
 
 	/*
 	 * Advice that rewrites vm_map_entry fields needs the exclusive map
 	 * lock, because entries get clipped.  Advice that is forwarded to
 	 * the backing objects only needs the read lock.  Unknown advice is
 	 * rejected before any lock is taken.
 	 */
 	switch (behav) {
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
 		exclusive = 0;
 		vm_map_lock_read(map);
 		break;
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
 	case MADV_NOSYNC:
 	case MADV_AUTOSYNC:
 	case MADV_NOCORE:
 	case MADV_CORE:
 		exclusive = 1;
 		vm_map_lock(map);
 		break;
 	default:
 		return (KERN_INVALID_ARGUMENT);
 	}
 
 	/*
 	 * Find the first entry of the range.  It is clipped at "start" only
 	 * when entries are going to be modified.
 	 */
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first))
 		first = first->next;
 	else if (exclusive) {
 		vm_map_clip_start(map, first, start);
 	}
 
 	if (!exclusive) {
 		/*
 		 * Object-level advice.  Entries are left unclipped, so the
 		 * page index window [pg_lo, pg_hi) is trimmed to the part
 		 * of each entry that lies inside [start, end) instead.
 		 */
 		for (cur = first;
 		    cur != &map->header && cur->start < end;
 		    cur = cur->next) {
 			if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 				continue;
 
 			pg_lo = OFF_TO_IDX(cur->offset);
 			pg_hi = pg_lo + atop(cur->end - cur->start);
 			va = cur->start;
 
 			if (start > cur->start) {
 				pg_lo += atop(start - cur->start);
 				va = start;
 			}
 			if (end < cur->end)
 				pg_hi -= atop(cur->end - end);
 
 			if (pg_lo >= pg_hi)
 				continue;
 
 			vm_object_madvise(cur->object.vm_object, pg_lo,
 			    pg_hi, behav);
 
 			/* MADV_WILLNEED additionally premaps the pages. */
 			if (behav == MADV_WILLNEED) {
 				vm_map_pmap_enter(map, va, cur->protection,
 				    cur->object.vm_object, pg_lo,
 				    ptoa(pg_hi - pg_lo),
 				    MAP_PREFAULT_MADVISE);
 			}
 		}
 		vm_map_unlock_read(map);
 		return (0);
 	}
 
 	/*
 	 * Entry-level advice.  Each entry is clipped at "end" so that the
 	 * change stays confined to the requested range, updated, and then
 	 * offered to vm_map_simplify_entry().
 	 */
 	for (cur = first;
 	    cur != &map->header && cur->start < end;
 	    cur = cur->next) {
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			continue;
 
 		vm_map_clip_end(map, cur, end);
 
 		switch (behav) {
 		case MADV_NORMAL:
 			vm_map_entry_set_behavior(cur,
 			    MAP_ENTRY_BEHAV_NORMAL);
 			break;
 		case MADV_SEQUENTIAL:
 			vm_map_entry_set_behavior(cur,
 			    MAP_ENTRY_BEHAV_SEQUENTIAL);
 			break;
 		case MADV_RANDOM:
 			vm_map_entry_set_behavior(cur,
 			    MAP_ENTRY_BEHAV_RANDOM);
 			break;
 		case MADV_NOSYNC:
 			cur->eflags |= MAP_ENTRY_NOSYNC;
 			break;
 		case MADV_AUTOSYNC:
 			cur->eflags &= ~MAP_ENTRY_NOSYNC;
 			break;
 		case MADV_NOCORE:
 			cur->eflags |= MAP_ENTRY_NOCOREDUMP;
 			break;
 		case MADV_CORE:
 			cur->eflags &= ~MAP_ENTRY_NOCOREDUMP;
 			break;
 		default:
 			break;
 		}
 		vm_map_simplify_entry(map, cur);
 	}
 	vm_map_unlock(map);
 	return (0);
 }
 
 
 /*
  *	vm_map_inherit:
  *
  *	Sets the inheritance of the specified address
  *	range in the target map.  Inheritance
  *	affects how the map will be shared with
  *	child maps at the time of vmspace_fork.
  */
 int
 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_inherit_t new_inheritance)
 {
 	vm_map_entry_t ent;
 
 	/* Only the three known inheritance modes are accepted. */
 	if (new_inheritance != VM_INHERIT_NONE &&
 	    new_inheritance != VM_INHERIT_COPY &&
 	    new_inheritance != VM_INHERIT_SHARE)
 		return (KERN_INVALID_ARGUMENT);
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	/*
 	 * Begin with the entry containing "start", clipped so that it
 	 * begins there, or with the entry after the lookup result when
 	 * "start" is not mapped.
 	 */
 	if (!vm_map_lookup_entry(map, start, &ent))
 		ent = ent->next;
 	else {
 		vm_map_clip_start(map, ent, start);
 	}
 
 	/*
 	 * Apply the new inheritance to every entry in the range, clipping
 	 * at "end" so that nothing beyond the range is affected.
 	 */
 	for (; ent != &map->header && ent->start < end; ent = ent->next) {
 		vm_map_clip_end(map, ent, end);
 		ent->inheritance = new_inheritance;
 		vm_map_simplify_entry(map, ent);
 	}
 
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_unwire:
  *
  *	Implements both kernel and user unwiring.
  */
 int
 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	/*
 	 * Works in two phases.  The first loop clips every entry in
 	 * [start, end), marks it MAP_ENTRY_IN_TRANSITION and validates it.
 	 * The second loop (label "done") revisits the marked entries,
 	 * drops the wiring when the first phase succeeded, clears the
 	 * in-transition mark and records whether sleepers must be woken.
 	 * On a validation failure "end" is pulled back so that the second
 	 * loop only visits entries that were actually marked.
 	 */
 	vm_map_entry_t entry, first_entry, tmp_entry;
 	vm_offset_t saved_start;
 	unsigned int last_timestamp;
 	int rv;
 	boolean_t need_wakeup, result, user_unwire;
 
 	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	/*
 	 * An unmapped "start" is an error unless the caller passed
 	 * VM_MAP_WIRE_HOLESOK, in which case processing begins with the
 	 * entry following the lookup result.
 	 */
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (flags & VM_MAP_WIRE_HOLESOK)
 			first_entry = first_entry->next;
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			saved_start = (start >= entry->start) ? start :
 			    entry->start;
 			/* Ask to be woken, then sleep with the map unlocked. */
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			if (vm_map_unlock_and_wait(map, 0)) {
 				/*
 				 * Allow interruption of user unwiring?
 				 */
 			}
 			vm_map_lock(map);
 			if (last_timestamp+1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry)) {
 					if (flags & VM_MAP_WIRE_HOLESOK)
 						tmp_entry = tmp_entry->next;
 					else {
 						if (saved_start == start) {
 							/*
 							 * First_entry has been deleted.
 							 */
 							vm_map_unlock(map);
 							return (KERN_INVALID_ADDRESS);
 						}
 						end = saved_start;
 						rv = KERN_INVALID_ADDRESS;
 						goto done;
 					}
 				}
 				/*
 				 * Keep first_entry usable only if it is the
 				 * entry that was re-looked-up; otherwise set
 				 * it to NULL so that the "done" code looks it
 				 * up again.
 				 */
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 			}
 			last_timestamp = map->timestamp;
 			/* Re-examine the (possibly new) entry from the top. */
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		/*
 		 * Check the map for holes in the specified region.
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
 		    (entry->end < end && (entry->next == &map->header ||
 		    entry->next->start > entry->end))) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		/*
 		 * If system unwiring, require that the entry is system wired.
 		 */
 		if (!user_unwire &&
 		    vm_map_entry_system_wired_count(entry) == 0) {
 			end = entry->end;
 			rv = KERN_INVALID_ARGUMENT;
 			goto done;
 		}
 		entry = entry->next;
 	}
 	rv = KERN_SUCCESS;
 done:
 	need_wakeup = FALSE;
 	/* first_entry was invalidated above; find it again. */
 	if (first_entry == NULL) {
 		result = vm_map_lookup_entry(map, start, &first_entry);
 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
 			first_entry = first_entry->next;
 		else
 			KASSERT(result, ("vm_map_unwire: lookup failed"));
 	}
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		/*
 		 * Drop one wiring only when validation succeeded.  For a
 		 * user unwire, entries that are not user wired are left
 		 * alone.
 		 */
 		if (rv == KERN_SUCCESS && (!user_unwire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
 			if (user_unwire)
 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 			entry->wired_count--;
 			if (entry->wired_count == 0) {
 				/*
 				 * Retain the map lock.
 				 */
 				vm_fault_unwire(map, entry->start, entry->end,
 				    entry->object.vm_object != NULL &&
 				    (entry->object.vm_object->flags &
 				    OBJ_FICTITIOUS) != 0);
 			}
 		}
 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
 			("vm_map_unwire: in-transition flag missing"));
 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
 		/* Remember that some thread asked to be woken. */
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
 		entry = entry->next;
 	}
 	vm_map_unlock(map);
 	/* The wakeup is issued after the map lock has been released. */
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  *	vm_map_wire:
  *
  *	Implements both kernel and user wiring.
  */
 int
 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	/*
 	 * Works in two phases.  The first loop clips every entry in
 	 * [start, end), marks it MAP_ENTRY_IN_TRANSITION and wires it,
 	 * dropping the map lock around vm_fault_wire().  The second loop
 	 * (label "done") revisits the marked entries: on success it sets
 	 * MAP_ENTRY_USER_WIRED for user wirings, on failure it backs out
 	 * the wirings made so far; in both cases it clears the transition
 	 * and skip marks and records whether sleepers must be woken.
 	 * On failure "end" is pulled back so that the second loop only
 	 * visits entries that were actually marked.
 	 */
 	vm_map_entry_t entry, first_entry, tmp_entry;
 	vm_offset_t saved_end, saved_start;
 	unsigned int last_timestamp;
 	int rv;
 	boolean_t fictitious, need_wakeup, result, user_wire;
 	vm_prot_t prot;
 
 	/* With VM_MAP_WIRE_WRITE, entries must also be writable. */
 	prot = 0;
 	if (flags & VM_MAP_WIRE_WRITE)
 		prot |= VM_PROT_WRITE;
 	user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	/*
 	 * An unmapped "start" is an error unless the caller passed
 	 * VM_MAP_WIRE_HOLESOK, in which case processing begins with the
 	 * entry following the lookup result.
 	 */
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (flags & VM_MAP_WIRE_HOLESOK)
 			first_entry = first_entry->next;
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			saved_start = (start >= entry->start) ? start :
 			    entry->start;
 			/* Ask to be woken, then sleep with the map unlocked. */
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			if (vm_map_unlock_and_wait(map, 0)) {
 				/*
 				 * Allow interruption of user wiring?
 				 */
 			}
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry)) {
 					if (flags & VM_MAP_WIRE_HOLESOK)
 						tmp_entry = tmp_entry->next;
 					else {
 						if (saved_start == start) {
 							/*
 							 * first_entry has been deleted.
 							 */
 							vm_map_unlock(map);
 							return (KERN_INVALID_ADDRESS);
 						}
 						end = saved_start;
 						rv = KERN_INVALID_ADDRESS;
 						goto done;
 					}
 				}
 				/*
 				 * Keep first_entry usable only if it is the
 				 * entry that was re-looked-up; otherwise set
 				 * it to NULL so that the "done" code looks it
 				 * up again.
 				 */
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 			}
 			last_timestamp = map->timestamp;
 			/* Re-examine the (possibly new) entry from the top. */
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		/*
 		 * An entry that is neither readable nor executable, or that
 		 * lacks the required write permission, is not wired.  It is
 		 * flagged MAP_ENTRY_WIRE_SKIPPED so that the "done" loop
 		 * leaves its wired_count alone.  Without
 		 * VM_MAP_WIRE_HOLESOK this is a failure.
 		 */
 		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
 		    || (entry->protection & prot) != prot) {
 			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
 			if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
 				end = entry->end;
 				rv = KERN_INVALID_ADDRESS;
 				goto done;
 			}
 			goto next_entry;
 		}
 		if (entry->wired_count == 0) {
 			/*
 			 * First wiring of this entry.  Remember its bounds
 			 * and whether its object is fictitious, because the
 			 * entry may be clipped while the lock is dropped.
 			 */
 			entry->wired_count++;
 			saved_start = entry->start;
 			saved_end = entry->end;
 			fictitious = entry->object.vm_object != NULL &&
 			    (entry->object.vm_object->flags &
 			    OBJ_FICTITIOUS) != 0;
 			/*
 			 * Release the map lock, relying on the in-transition
 			 * mark.  Mark the map busy for fork.
 			 */
 			vm_map_busy(map);
 			vm_map_unlock(map);
 			rv = vm_fault_wire(map, saved_start, saved_end,
 			    fictitious);
 			vm_map_lock(map);
 			vm_map_unbusy(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.  The entry
 				 * may have been clipped, but NOT merged or
 				 * deleted.
 				 */
 				result = vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry);
 				KASSERT(result, ("vm_map_wire: lookup failed"));
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 				/*
 				 * Walk the pieces the original entry was
 				 * clipped into, stopping at the one that
 				 * ends at saved_end.  On failure each
 				 * earlier piece gets the -1 marker.
 				 */
 				while (entry->end < saved_end) {
 					if (rv != KERN_SUCCESS) {
 						KASSERT(entry->wired_count == 1,
 						    ("vm_map_wire: bad count"));
 						entry->wired_count = -1;
 					}
 					entry = entry->next;
 				}
 			}
 			last_timestamp = map->timestamp;
 			if (rv != KERN_SUCCESS) {
 				KASSERT(entry->wired_count == 1,
 				    ("vm_map_wire: bad count"));
 				/*
 				 * Assign an out-of-range value to represent
 				 * the failure to wire this entry.
 				 */
 				entry->wired_count = -1;
 				end = entry->end;
 				goto done;
 			}
 		} else if (!user_wire ||
 			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			/*
 			 * Already wired: just count another wiring, except
 			 * for a user wiring of an entry that is already
 			 * user wired.
 			 */
 			entry->wired_count++;
 		}
 		/*
 		 * Check the map for holes in the specified region.
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 	next_entry:
 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
 		    (entry->end < end && (entry->next == &map->header ||
 		    entry->next->start > entry->end))) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		entry = entry->next;
 	}
 	rv = KERN_SUCCESS;
 done:
 	need_wakeup = FALSE;
 	/* first_entry was invalidated above; find it again. */
 	if (first_entry == NULL) {
 		result = vm_map_lookup_entry(map, start, &first_entry);
 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
 			first_entry = first_entry->next;
 		else
 			KASSERT(result, ("vm_map_wire: lookup failed"));
 	}
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		/* Skipped entries were never counted; only clear marks. */
 		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
 			goto next_entry_done;
 		if (rv == KERN_SUCCESS) {
 			if (user_wire)
 				entry->eflags |= MAP_ENTRY_USER_WIRED;
 		} else if (entry->wired_count == -1) {
 			/*
 			 * Wiring failed on this entry.  Thus, unwiring is
 			 * unnecessary.
 			 */
 			entry->wired_count = 0;
 		} else {
 			/*
 			 * Back out the wiring added by the first loop,
 			 * mirroring the condition under which it was
 			 * counted.
 			 */
 			if (!user_wire ||
 			    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
 				entry->wired_count--;
 			if (entry->wired_count == 0) {
 				/*
 				 * Retain the map lock.
 				 */
 				vm_fault_unwire(map, entry->start, entry->end,
 				    entry->object.vm_object != NULL &&
 				    (entry->object.vm_object->flags &
 				    OBJ_FICTITIOUS) != 0);
 			}
 		}
 	next_entry_done:
 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
 			("vm_map_wire: in-transition flag missing"));
 		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION|MAP_ENTRY_WIRE_SKIPPED);
 		/* Remember that some thread asked to be woken. */
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
 		entry = entry->next;
 	}
 	vm_map_unlock(map);
 	/* The wakeup is issued after the map lock has been released. */
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  * vm_map_sync
  *
  * Push any dirty cached pages in the address range to their pager.
  * If syncio is TRUE, dirty pages are written synchronously.
  * If invalidate is TRUE, any cached pages are freed as well.
  *
  * If the size of the region from start to end is zero, we are
  * supposed to flush all modified pages within the region containing
  * start.  Unfortunately, a region can be split or coalesced with
  * neighboring regions, making it difficult to determine what the
  * original region was.  Therefore, we approximate this requirement by
  * flushing the current region containing start.
  *
  * Returns an error if any part of the specified range is not mapped.
  */
 int
 vm_map_sync(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	boolean_t syncio,
 	boolean_t invalidate)
 {
 	/*
 	 * Return values:
 	 *   KERN_INVALID_ADDRESS  - "start" is unmapped or the range has a
 	 *                           hole
 	 *   KERN_INVALID_ARGUMENT - invalidate requested on a user-wired
 	 *                           entry
 	 *   KERN_FAILURE          - vm_object_sync() failed for some part
 	 *   KERN_SUCCESS          - otherwise
 	 */
 	vm_map_entry_t current;
 	vm_map_entry_t entry;
 	vm_size_t size;
 	vm_object_t object;
 	vm_ooffset_t offset;
 	unsigned int last_timestamp;
 	boolean_t failed;
 
 	vm_map_lock_read(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	} else if (start == end) {
 		/* Empty range: use the bounds of the containing entry. */
 		start = entry->start;
 		end = entry->end;
 	}
 	/*
 	 * Make a first pass to check for user-wired memory and holes.
 	 */
 	for (current = entry; current != &map->header && current->start < end;
 	    current = current->next) {
 		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		/*
 		 * The range continues past this entry, so the next entry
 		 * must exist and begin exactly where this one ends.
 		 */
 		if (end > current->end &&
 		    (current->next == &map->header ||
 			current->end != current->next->start)) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 
 	if (invalidate)
 		pmap_remove(map->pmap, start, end);
 	failed = FALSE;
 
 	/*
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
 	for (current = entry; current != &map->header && current->start < end;) {
 		/*
 		 * "start" advances as chunks are processed; compute the
 		 * object offset of the current position and the number of
 		 * bytes of the range that fall inside this entry.
 		 */
 		offset = current->offset + (start - current->start);
 		size = (end <= current->end ? end : current->end) - start;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			vm_map_t smap;
 			vm_map_entry_t tentry;
 			vm_size_t tsize;
 
 			/*
 			 * Resolve the submap entry at this offset and limit
 			 * the chunk to what that entry covers.  The result
 			 * of the lookup is not checked here.
 			 */
 			smap = current->object.sub_map;
 			vm_map_lock_read(smap);
 			(void) vm_map_lookup_entry(smap, offset, &tentry);
 			tsize = tentry->end - offset;
 			if (tsize < size)
 				size = tsize;
 			object = tentry->object.vm_object;
 			offset = tentry->offset + (offset - tentry->start);
 			vm_map_unlock_read(smap);
 		} else {
 			object = current->object.vm_object;
 		}
 		/*
 		 * Hold a reference on the object so that it can be used
 		 * while the map lock is dropped around vm_object_sync().
 		 */
 		vm_object_reference(object);
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 		if (!vm_object_sync(object, offset, size, syncio, invalidate))
 			failed = TRUE;
 		start += size;
 		vm_object_deallocate(object);
 		vm_map_lock_read(map);
 		/*
 		 * If the timestamp did not change, step to the next entry.
 		 * Otherwise look up the new "start"; a successful lookup
 		 * leaves the found entry in "current", a failed one is
 		 * followed by a step to the next entry.
 		 */
 		if (last_timestamp == map->timestamp ||
 		    !vm_map_lookup_entry(map, start, &current))
 			current = current->next;
 	}
 
 	vm_map_unlock_read(map);
 	return (failed ? KERN_FAILURE : KERN_SUCCESS);
 }
 
 /*
  *	vm_map_entry_unwire:	[ internal use only ]
  *
  *	Make the region specified by this entry pageable.
  *
  *	The map in question should be locked.
  *	[This is the reason for this routine's existence.]
  */
 static void
 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
 {
 	int fictitious;
 
 	/* Tell vm_fault_unwire() whether the backing object is fictitious. */
 	fictitious = 0;
 	if (entry->object.vm_object != NULL &&
 	    (entry->object.vm_object->flags & OBJ_FICTITIOUS) != 0)
 		fictitious = 1;
 
 	/* Unwire the entry's whole range, then reset its wiring count. */
 	vm_fault_unwire(map, entry->start, entry->end, fictitious);
 	entry->wired_count = 0;
 }
 
 /*
  * Release what a map entry still holds and free the entry itself.
  * "system_map" selects the zone the entry is returned to.
  */
 static void
 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
 {
 
 	/* Entries that are not submaps drop their object reference. */
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
 		vm_object_deallocate(entry->object.vm_object);
 
 	/* Hand the entry back to the matching zone. */
 	if (system_map)
 		uma_zfree(kmapentzone, entry);
 	else
 		uma_zfree(mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_delete:	[ internal use only ]
  *
  *	Deallocate the given entry from the target map.
  */
 static void
 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t object;
 	vm_pindex_t offidxstart, offidxend, count, size1;
 	vm_ooffset_t size;
 
 	/* Take the entry out of the map and shrink the map's size. */
 	vm_map_entry_unlink(map, entry);
 	object = entry->object.vm_object;
 	size = entry->end - entry->start;
 	map->size -= size;
 
 	/* Give back the swap charge that was recorded on the entry. */
 	if (entry->cred != NULL) {
 		swap_release_by_cred(size, entry->cred);
 		crfree(entry->cred);
 	}
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
 	    (object != NULL)) {
 		KASSERT(entry->cred == NULL || object->cred == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
 		/* Page index range of the object covered by this entry. */
 		count = OFF_TO_IDX(size);
 		offidxstart = OFF_TO_IDX(entry->offset);
 		offidxend = offidxstart + count;
 		VM_OBJECT_LOCK(object);
 		/*
 		 * Pages are removed here only when other references to the
 		 * object remain and either the object has OBJ_ONEMAPPING
 		 * set without OBJ_NOSPLIT, or it is kernel_object or
 		 * kmem_object.
 		 */
 		if (object->ref_count != 1 &&
 		    ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
 		    object == kernel_object || object == kmem_object)) {
 			vm_object_collapse(object);
 
 			/*
 			 * The option OBJPR_NOTMAPPED can be passed here
 			 * because vm_map_delete() already performed
 			 * pmap_remove() on the only mapping to this range
 			 * of pages.
 			 */
 			vm_object_page_remove(object, offidxstart, offidxend,
 			    OBJPR_NOTMAPPED);
 			if (object->type == OBJT_SWAP)
 				swap_pager_freespace(object, offidxstart, count);
 			/*
 			 * If the removed range reaches the end of the
 			 * object, truncate the object to the start of the
 			 * range and release the charge for the pages that
 			 * were cut off.
 			 */
 			if (offidxend >= object->size &&
 			    offidxstart < object->size) {
 				size1 = object->size;
 				object->size = offidxstart;
 				if (object->cred != NULL) {
 					size1 -= object->size;
 					KASSERT(object->charge >= ptoa(size1),
 					    ("vm_map_entry_delete: object->charge < 0"));
 					swap_release_by_cred(ptoa(size1), object->cred);
 					object->charge -= ptoa(size1);
 				}
 			}
 		}
 		VM_OBJECT_UNLOCK(object);
 	} else
 		/* Submap entry or no object: clear the object pointer. */
 		entry->object.vm_object = NULL;
 	/*
 	 * Entries of a system map are deallocated right away.  Other
 	 * entries are pushed onto the current thread's td_map_def_user
 	 * list instead of being deallocated here.
 	 */
 	if (map->system_map)
 		vm_map_entry_deallocate(entry, TRUE);
 	else {
 		entry->next = curthread->td_map_def_user;
 		curthread->td_map_def_user = entry;
 	}
 }
 
 /*
  *	vm_map_delete:	[ internal use only ]
  *
  *	Deallocates the given address range from the target
  *	map.
  */
 int
 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	/*
 	 * The caller must hold the map lock (asserted below).  The lock
 	 * may be dropped and retaken while waiting for an entry that is
 	 * in transition.  Always returns KERN_SUCCESS.
 	 */
 	vm_map_entry_t entry;
 	vm_map_entry_t first_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	/*
 	 * Find the start of the region, and clip it
 	 */
 	if (!vm_map_lookup_entry(map, start, &first_entry))
 		entry = first_entry->next;
 	else {
 		entry = first_entry;
 		vm_map_clip_start(map, entry, start);
 	}
 
 	/*
 	 * Step through all entries in this region
 	 */
 	while ((entry != &map->header) && (entry->start < end)) {
 		vm_map_entry_t next;
 
 		/*
 		 * Wait for wiring or unwiring of an entry to complete.
 		 * Also wait for any system wirings to disappear on
 		 * user maps.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
 		    (vm_map_pmap(map) != kernel_pmap &&
 		    vm_map_entry_system_wired_count(entry) != 0)) {
 			unsigned int last_timestamp;
 			vm_offset_t saved_start;
 			vm_map_entry_t tmp_entry;
 
 			/*
 			 * Remember where this entry began so that it can be
 			 * found again after sleeping with the map unlocked.
 			 */
 			saved_start = entry->start;
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			last_timestamp = map->timestamp;
 			(void) vm_map_unlock_and_wait(map, 0);
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 							 &tmp_entry))
 					entry = tmp_entry->next;
 				else {
 					entry = tmp_entry;
 					vm_map_clip_start(map, entry,
 							  saved_start);
 				}
 			}
 			/* Re-evaluate the entry from the top of the loop. */
 			continue;
 		}
 		vm_map_clip_end(map, entry, end);
 
 		/* Save the successor now; the entry is deleted below. */
 		next = entry->next;
 
 		/*
 		 * Unwire before removing addresses from the pmap; otherwise,
 		 * unwiring will put the entries back in the pmap.
 		 */
 		if (entry->wired_count != 0) {
 			vm_map_entry_unwire(map, entry);
 		}
 
 		pmap_remove(map->pmap, entry->start, entry->end);
 
 		/*
 		 * Delete the entry only after removing all pmap
 		 * entries pointing to its pages.  (Otherwise, its
 		 * page frames may be reallocated, and any modify bits
 		 * will be set in the wrong object!)
 		 */
 		vm_map_entry_delete(map, entry);
 		entry = next;
 	}
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_remove:
  *
  *	Remove the given address range from the target map.
  *	This is the exported form of vm_map_delete.
  */
 int
 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	int rv;
 
 	/*
 	 * Take the map lock, range-check the arguments, delete the range
 	 * and pass vm_map_delete()'s result back to the caller.
 	 */
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	rv = vm_map_delete(map, start, end);
 	vm_map_unlock(map);
 
 	return (rv);
 }
 
 /*
  *	vm_map_check_protection:
  *
  *	Report whether every address in the given region is covered by a
  *	map entry whose protection includes all of the requested bits.
  *	FALSE is returned as soon as a hole in the mapping, or an entry
  *	lacking one of the bits, is found; TRUE otherwise.
  *
  *	WARNING!  Only the map entries are examined.  Whether the
  *	contents of the region are actually accessible is not, and
  *	should not be, checked here.  For example a smaller file might
  *	be mapped into a larger address space.
  *
  *	NOTE!  munmap() is among the callers of this code.
  *
  *	The caller must hold the map lock; a read lock is sufficient.
  */
 boolean_t
 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
 			vm_prot_t protection)
 {
 	vm_map_entry_t cur;
 
 	/* The first address must itself lie inside some entry. */
 	if (!vm_map_lookup_entry(map, start, &cur))
 		return (FALSE);
 
 	/*
 	 * Step through the entries in address order, moving "start" to
 	 * the end of each entry that passes, until the region is covered.
 	 */
 	for (; start < end; start = cur->end, cur = cur->next) {
 		/*
 		 * Give up when the list is exhausted, when there is a
 		 * hole in front of this entry, or when the entry does
 		 * not grant every requested protection bit.
 		 */
 		if (cur == &map->header || start < cur->start ||
 		    (cur->protection & protection) != protection)
 			return (FALSE);
 	}
 	return (TRUE);
 }
 
 /*
  *	vm_map_copy_entry:
  *
  *	Copies the contents of the source entry to the destination
  *	entry.  The entries *must* be aligned properly.
  *
  *	Nothing is done if either entry is a submap.  For an unwired
  *	source entry the copy is set up lazily: the source range is
  *	write-protected in the source pmap (unless the entry is already
  *	needs-copy), the destination entry is pointed at the source
  *	object, and both entries are flagged COW and NEEDS_COPY.  For a
  *	wired source entry the work is handed to vm_fault_copy_entry().
  *
  *	*fork_charge is increased by the entry size each time a
  *	credential reference is taken on behalf of an entry here; it is
  *	also passed through to vm_fault_copy_entry().
  *
  *	The destination map must be locked (asserted below).
  */
 static void
 vm_map_copy_entry(
 	vm_map_t src_map,
 	vm_map_t dst_map,
 	vm_map_entry_t src_entry,
 	vm_map_entry_t dst_entry,
 	vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
 	vm_map_entry_t fake_entry;
 	vm_offset_t size;
 	struct ucred *cred;
 	int charged;
 
 	VM_MAP_ASSERT_LOCKED(dst_map);
 
 	/* Submap entries are never copied by this routine. */
 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
 		return;
 
 	if (src_entry->wired_count == 0) {
 
 		/*
 		 * If the source entry is marked needs_copy, it is already
 		 * write-protected.
 		 */
 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
 			pmap_protect(src_map->pmap,
 			    src_entry->start,
 			    src_entry->end,
 			    src_entry->protection & ~VM_PROT_WRITE);
 		}
 
 		/*
 		 * Make a copy of the object.
 		 */
 		size = src_entry->end - src_entry->start;
 		if ((src_object = src_entry->object.vm_object) != NULL) {
 			VM_OBJECT_LOCK(src_object);
 			/*
 			 * NOTE(review): ENTRY_CHARGED() is defined outside
 			 * this excerpt.  Its result is captured here, before
 			 * the credential hand-off below, and is consumed
 			 * after the object lock has been dropped.
 			 */
 			charged = ENTRY_CHARGED(src_entry);
 			/*
 			 * Only objects without a handle, of type
 			 * OBJT_DEFAULT or OBJT_SWAP, are collapsed here and
 			 * possibly split.
 			 */
 			if ((src_object->handle == NULL) &&
 				(src_object->type == OBJT_DEFAULT ||
 				 src_object->type == OBJT_SWAP)) {
 				vm_object_collapse(src_object);
 				if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
 					vm_object_split(src_entry);
 					/* Re-read: the entry's object is reloaded after the split. */
 					src_object = src_entry->object.vm_object;
 				}
 			}
 			/* The destination entry's reference on the object. */
 			vm_object_reference_locked(src_object);
 			vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
 			/*
 			 * A source entry that carries a credential and is
 			 * not yet needs-copy records that credential, and
 			 * the entry size as the charge, on the object.
 			 */
 			if (src_entry->cred != NULL &&
 			    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 				KASSERT(src_object->cred == NULL,
 				    ("OVERCOMMIT: vm_map_copy_entry: cred %p",
 				     src_object));
 				src_object->cred = src_entry->cred;
 				src_object->charge = size;
 			}
 			VM_OBJECT_UNLOCK(src_object);
 			dst_entry->object.vm_object = src_object;
 			if (charged) {
 				/*
 				 * Each entry that will need its own copy
 				 * gets a hold on the current thread's
 				 * credential, and each hold adds "size" to
 				 * *fork_charge.  The source entry only gets
 				 * one if it was not already needs-copy.
 				 */
 				cred = curthread->td_ucred;
 				crhold(cred);
 				dst_entry->cred = cred;
 				*fork_charge += size;
 				if (!(src_entry->eflags &
 				      MAP_ENTRY_NEEDS_COPY)) {
 					crhold(cred);
 					src_entry->cred = cred;
 					*fork_charge += size;
 				}
 			}
 			src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->offset = src_entry->offset;
 			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
 				/*
 				 * MAP_ENTRY_VN_WRITECNT cannot
 				 * indicate write reference from
 				 * src_entry, since the entry is
 				 * marked as needs copy.  Allocate a
 				 * fake entry that is used to
 				 * decrement object->un_pager.vnp.writecount
 				 * at the appropriate time.  Attach
 				 * fake_entry to the deferred list.
 				 */
 				fake_entry = vm_map_entry_create(dst_map);
 				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
 				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
 				/* The fake entry holds its own object reference. */
 				vm_object_reference(src_object);
 				fake_entry->object.vm_object = src_object;
 				fake_entry->start = src_entry->start;
 				fake_entry->end = src_entry->end;
 				/* Push onto the current thread's td_map_def_user list. */
 				fake_entry->next = curthread->td_map_def_user;
 				curthread->td_map_def_user = fake_entry;
 			}
 		} else {
 			/*
 			 * The source has no object yet, so there is nothing
 			 * to share.  The destination starts without one too
 			 * and, if the source entry carries a credential,
 			 * takes a hold on the current thread's credential.
 			 */
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
 			if (src_entry->cred != NULL) {
 				dst_entry->cred = curthread->td_ucred;
 				crhold(dst_entry->cred);
 				*fork_charge += size;
 			}
 		}
 
 		/* Let the pmap layer copy the mappings for the range. */
 		pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
 		    dst_entry->end - dst_entry->start, src_entry->start);
 	} else {
 		/*
 		 * Of course, wired down pages can't be set copy-on-write.
 		 * Cause wired pages to be copied into the new map by
 		 * simulating faults (the new pages are pageable)
 		 */
 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
 		    fork_charge);
 	}
 }
 
 /*
  * vmspace_map_entry_forked:
  * Account for one map entry that has just been inherited or copied
  * into the newly-forked vmspace vm2.  The child's map size always
  * grows by the size of the entry; in addition the entry is counted
  * toward the child's stack size if it is growable, or toward its data
  * or text size if it starts inside the parent's (vm1) data or text
  * segment.  The vm_dsize and vm_tsize figures are approximate (and
  * mostly-obsolete ideas in the face of mmap(2) et al.)
  */
 static void
 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
     vm_map_entry_t entry)
 {
 	vm_offset_t clipped, dbase, tbase;
 	vm_size_t len;
 
 	len = entry->end - entry->start;
 	vm2->vm_map.size += len;
 
 	/* Growable entries are charged, whole, to the stack size. */
 	if ((entry->eflags &
 	    (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) != 0) {
 		vm2->vm_ssize += btoc(len);
 		return;
 	}
 
 	/*
 	 * An entry that starts inside the parent's data segment is
 	 * charged to the data size, clipped at the end of the segment.
 	 */
 	dbase = (vm_offset_t)vm1->vm_daddr;
 	if (entry->start >= dbase &&
 	    entry->start < dbase + ctob(vm1->vm_dsize)) {
 		clipped = MIN(entry->end, dbase + ctob(vm1->vm_dsize));
 		vm2->vm_dsize += btoc(clipped - entry->start);
 		return;
 	}
 
 	/* Likewise for the text segment. */
 	tbase = (vm_offset_t)vm1->vm_taddr;
 	if (entry->start >= tbase &&
 	    entry->start < tbase + ctob(vm1->vm_tsize)) {
 		clipped = MIN(entry->end, tbase + ctob(vm1->vm_tsize));
 		vm2->vm_tsize += btoc(clipped - entry->start);
 	}
 }
 
 /*
  * vmspace_fork:
  * Create a new process vmspace structure and vm_map
  * based on those of an existing process.  The new map
  * is based on the old map, according to the inheritance
  * values on the regions in that map.
  *
  * vm1 is the vmspace to duplicate.  fork_charge is handed to
  * vm_map_copy_entry() for every VM_INHERIT_COPY entry; nothing else
  * in this function touches it.
  *
  * Returns the new vmspace, or NULL if vmspace_alloc() fails.
  * Panics if the old map contains a submap entry.
  *
  * XXX It might be worth coalescing the entries added to the new vmspace.
  *
  * The source map must not be locked.
  */
 struct vmspace *
 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 {
 	struct vmspace *vm2;
 	vm_map_t new_map, old_map;
 	vm_map_entry_t new_entry, old_entry;
 	vm_object_t object;
 	int locked;
 
 	old_map = &vm1->vm_map;
 	/* Copy immutable fields of vm1 to vm2. */
 	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
 	if (vm2 == NULL)
 		return (NULL);
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
 	/* Lock the old map and, if it is marked busy, wait for that to end. */
 	vm_map_lock(old_map);
 	if (old_map->busy)
 		vm_map_wait_busy(old_map);
 	new_map = &vm2->vm_map;
 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
 	KASSERT(locked, ("vmspace_fork: lock failed"));
 
 	old_entry = old_map->header.next;
 
 	/* Visit every entry of the old map, in list order. */
 	while (old_entry != &old_map->header) {
 		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			panic("vm_map_fork: encountered a submap");
 
 		switch (old_entry->inheritance) {
 		case VM_INHERIT_NONE:
 			/* Not inherited: no entry is created in the new map. */
 			break;
 
 		case VM_INHERIT_SHARE:
 			/*
 			 * Clone the entry, creating the shared object if necessary.
 			 */
 			object = old_entry->object.vm_object;
 			if (object == NULL) {
 				object = vm_object_allocate(OBJT_DEFAULT,
 					atop(old_entry->end - old_entry->start));
 				old_entry->object.vm_object = object;
 				old_entry->offset = 0;
 				/*
 				 * Move the entry's credential, if any, onto
 				 * the freshly allocated object.
 				 */
 				if (old_entry->cred != NULL) {
 					object->cred = old_entry->cred;
 					object->charge = old_entry->end -
 					    old_entry->start;
 					old_entry->cred = NULL;
 				}
 			}
 
 			/*
 			 * Add the reference before calling vm_object_shadow
 			 * to insure that a shadow object is created.
 			 */
 			vm_object_reference(object);
 			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 				vm_object_shadow(&old_entry->object.vm_object,
 				    &old_entry->offset,
 				    old_entry->end - old_entry->start);
 				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 				/* Transfer the second reference too. */
 				vm_object_reference(
 				    old_entry->object.vm_object);
 
 				/*
 				 * As in vm_map_simplify_entry(), the
 				 * vnode lock will not be acquired in
 				 * this call to vm_object_deallocate().
 				 */
 				vm_object_deallocate(object);
 				/* From here on work with the entry's current object. */
 				object = old_entry->object.vm_object;
 			}
 			VM_OBJECT_LOCK(object);
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
 			/*
 			 * Any credential still held by the old entry is
 			 * moved onto the object, so the cloned entry below
 			 * is copied with a NULL cred.
 			 */
 			if (old_entry->cred != NULL) {
 				KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
 				object->cred = old_entry->cred;
 				object->charge = old_entry->end - old_entry->start;
 				old_entry->cred = NULL;
 			}
 			VM_OBJECT_UNLOCK(object);
 
 			/*
 			 * Clone the entry, referencing the shared object.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			/* The clone starts out unwired and not in transition. */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wired_count = 0;
 			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
 				/*
 				 * The clone keeps MAP_ENTRY_VN_WRITECNT, so
 				 * the vnode pager's write accounting is
 				 * updated for the new entry's range.
 				 */
 				object = new_entry->object.vm_object;
 				KASSERT(((struct vnode *)object->handle)->
 				    v_writecount > 0,
 				    ("vmspace_fork: v_writecount"));
 				KASSERT(object->un_pager.vnp.writemappings > 0,
 				    ("vmspace_fork: vnp.writecount"));
 				vnode_pager_update_writecount(object,
 				    new_entry->start, new_entry->end);
 			}
 
 			/*
 			 * Insert the entry into the new map -- we know we're
 			 * inserting at the end of the new map.
 			 */
 			vm_map_entry_link(new_map, new_map->header.prev,
 			    new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			/*
 			 * Update the physical map
 			 */
 			pmap_copy(new_map->pmap, old_map->pmap,
 			    new_entry->start,
 			    (old_entry->end - old_entry->start),
 			    old_entry->start);
 			break;
 
 		case VM_INHERIT_COPY:
 			/*
 			 * Clone the entry and link into the map.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			/*
 			 * Copied entry is COW over the old object.
 			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
 			new_entry->wired_count = 0;
 			/*
 			 * Object and credential are cleared here and filled
 			 * in by vm_map_copy_entry() below.
 			 */
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
 			vm_map_entry_link(new_map, new_map->header.prev,
 			    new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 			vm_map_copy_entry(old_map, new_map, old_entry,
 			    new_entry, fork_charge);
 			break;
 		}
 		old_entry = old_entry->next;
 	}
 	/*
 	 * Use inlined vm_map_unlock() to postpone handling the deferred
 	 * map entries, which cannot be done until both old_map and
 	 * new_map locks are released.
 	 */
 	sx_xunlock(&old_map->lock);
 	sx_xunlock(&new_map->lock);
 	vm_map_process_deferred();
 
 	return (vm2);
 }
 
 /*
  *	vm_map_stack:
  *
  *	Reserve a growable stack region in "map".  addrbos is the lowest
  *	address of the region and max_ssize its maximum size.  Only an
  *	initial piece of min(max_ssize, sgrowsiz) bytes is mapped here;
  *	the remainder is recorded in the new entry's avail_ssize.
  *
  *	prot, max and cow are passed to vm_map_insert(), except that the
  *	MAP_STACK_GROWS_DOWN / MAP_STACK_GROWS_UP bits are first stripped
  *	from cow and used to choose the growth direction (at least one of
  *	them must be set).
  *
  *	Returns KERN_NO_SPACE if addrbos is outside the map or the range
  *	wraps, if addrbos is already mapped, if a memlock or vmem limit
  *	would be exceeded, or if the following entry starts inside the
  *	reserved range; otherwise returns the result of vm_map_insert().
  *
  *	The map is locked and unlocked by this function.
  */
 int
 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, prev_entry;
 	vm_offset_t bot, top;
 	vm_size_t growsize, init_ssize;
 	int orient, rv;
 	rlim_t lmemlim, vmemlim;
 
 	/*
 	 * The stack orientation is piggybacked with the cow argument.
 	 * Extract it into orient and mask the cow argument so that we
 	 * don't pass it around further.
 	 * NOTE: We explicitly allow bi-directional stacks.
 	 */
 	orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
 	cow &= ~orient;
 	KASSERT(orient != 0, ("No stack grow direction"));
 
 	/* Reject a base outside the map or a range that wraps around. */
 	if (addrbos < vm_map_min(map) ||
 	    addrbos > vm_map_max(map) ||
 	    addrbos + max_ssize < addrbos)
 		return (KERN_NO_SPACE);
 
 	/* Map at most sgrowsiz bytes up front. */
 	growsize = sgrowsiz;
 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
 
 	/* Snapshot the current process's limits under the proc lock. */
 	PROC_LOCK(curproc);
 	lmemlim = lim_cur(curproc, RLIMIT_MEMLOCK);
 	vmemlim = lim_cur(curproc, RLIMIT_VMEM);
 	PROC_UNLOCK(curproc);
 
 	vm_map_lock(map);
 
 	/* If addr is already mapped, no go */
 	if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * With MAP_WIREFUTURE set (and old_mlock clear), the initial
 	 * piece must also fit within the memlock limit.
 	 */
 	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 		if (ptoa(pmap_wired_count(map->pmap)) + init_ssize > lmemlim) {
 			vm_map_unlock(map);
 			return (KERN_NO_SPACE);
 		}
 	}
 
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + init_ssize > vmemlim) {
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * If we can't accommodate max_ssize in the current mapping, no go.
 	 * However, we need to be aware that subsequent user mappings might
 	 * map into the space we have reserved for stack, and currently this
 	 * space is not protected.
 	 *
 	 * Hopefully we will at least detect this condition when we try to
 	 * grow the stack.
 	 */
 	if ((prev_entry->next != &map->header) &&
 	    (prev_entry->next->start < addrbos + max_ssize)) {
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * We initially map a stack of only init_ssize.  We will grow as
 	 * needed later.  Depending on the orientation of the stack (i.e.
 	 * the grow direction) we either map at the top of the range, the
 	 * bottom of the range or in the middle.
 	 *
 	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
 	 * and cow to be 0.  Possibly we should eliminate these as input
 	 * parameters, and just pass these values here in the insert call.
 	 */
 	if (orient == MAP_STACK_GROWS_DOWN)
 		bot = addrbos + max_ssize - init_ssize;
 	else if (orient == MAP_STACK_GROWS_UP)
 		bot = addrbos;
 	else
 		bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
 	top = bot + init_ssize;
 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
 
 	/* Now set the avail_ssize amount. */
 	if (rv == KERN_SUCCESS) {
 		if (prev_entry != &map->header)
 			vm_map_clip_end(map, prev_entry, bot);
 		/* The entry just inserted is expected right after prev_entry. */
 		new_entry = prev_entry->next;
 		if (new_entry->end != top || new_entry->start != bot)
 			panic("Bad entry start/end for new stack entry");
 
 		/* Remember how much room is left to grow, and which way. */
 		new_entry->avail_ssize = max_ssize - init_ssize;
 		if (orient & MAP_STACK_GROWS_DOWN)
 			new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
 		if (orient & MAP_STACK_GROWS_UP)
 			new_entry->eflags |= MAP_ENTRY_GROWS_UP;
 	}
 
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  * security.bsd.stack_guard_page: when non-zero, vm_map_growstack()
  * keeps one extra PAGE_SIZE of room between a growing stack and the
  * address it may grow to.  Defaults to 0; it is registered both as a
  * tunable and as a read/write sysctl.
  */
 static int stack_guard_page = 0;
 TUNABLE_INT("security.bsd.stack_guard_page", &stack_guard_page);
 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RW,
     &stack_guard_page, 0,
     "Insert stack guard page ahead of the growable segments.");
 
 /* Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
  * desired address is already mapped, or if we successfully grow
  * the stack.  Also returns KERN_SUCCESS if addr is outside the
  * stack range (this is strange, but preserves compatibility with
  * the grow function in vm_machdep.c).
  *
  * p is the process whose vmspace map is examined and addr is the
  * address that should become part of a stack entry.
  *
  * Returns KERN_NO_SPACE when the growth would exceed the entry's
  * avail_ssize, the room up to the neighbouring entry, or one of the
  * stack, memlock or vmem limits.  KERN_NO_SPACE is also returned when
  * the swap reservation for upward growth fails, and KERN_FAILURE when
  * the backing object cannot be coalesced.
  *
  * The map is read-locked for the checks and upgraded to a write lock
  * before it is modified.  Whenever vm_map_lock_upgrade() returns
  * non-zero, the whole procedure restarts from "Retry".
  */
 int
 vm_map_growstack(struct proc *p, vm_offset_t addr)
 {
 	vm_map_entry_t next_entry, prev_entry;
 	vm_map_entry_t new_entry, stack_entry;
 	struct vmspace *vm = p->p_vmspace;
 	vm_map_t map = &vm->vm_map;
 	vm_offset_t end;
 	vm_size_t growsize;
 	size_t grow_amount, max_grow;
 	rlim_t lmemlim, stacklim, vmemlim;
 	int is_procstack, rv;
 	struct ucred *cred;
 #ifdef notyet
 	uint64_t limit;
 #endif
 #ifdef RACCT
 	int error;
 #endif
 
 Retry:
 	/* Snapshot the resource limits under the proc lock. */
 	PROC_LOCK(p);
 	lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
 	stacklim = lim_cur(p, RLIMIT_STACK);
 	vmemlim = lim_cur(p, RLIMIT_VMEM);
 	PROC_UNLOCK(p);
 
 	vm_map_lock_read(map);
 
 	/* If addr is already in the entry range, no need to grow.*/
 	if (vm_map_lookup_entry(map, addr, &prev_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_SUCCESS);
 	}
 
 	/*
 	 * addr falls in the gap between prev_entry and next_entry;
 	 * decide which of the two, if any, is the stack to grow.
 	 */
 	next_entry = prev_entry->next;
 	if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
 		/*
 		 * This entry does not grow upwards. Since the address lies
 		 * beyond this entry, the next entry (if one exists) has to
 		 * be a downward growable entry. The entry list header is
 		 * never a growable entry, so it suffices to check the flags.
 		 */
 		if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
 			vm_map_unlock_read(map);
 			return (KERN_SUCCESS);
 		}
 		stack_entry = next_entry;
 	} else {
 		/*
 		 * This entry grows upward. If the next entry does not at
 		 * least grow downwards, this is the entry we need to grow.
 		 * otherwise we have two possible choices and we have to
 		 * select one.
 		 */
 		if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
 			/*
 			 * We have two choices; grow the entry closest to
 			 * the address to minimize the amount of growth.
 			 */
 			if (addr - prev_entry->end <= next_entry->start - addr)
 				stack_entry = prev_entry;
 			else
 				stack_entry = next_entry;
 		} else
 			stack_entry = prev_entry;
 	}
 
 	/*
 	 * Compute, for the chosen direction:
 	 *   end         - the address the stack may grow to: the edge of
 	 *                 the neighbouring entry or, when the neighbour is
 	 *                 the list header, the limit implied by
 	 *                 avail_ssize;
 	 *   grow_amount - the page-rounded distance needed to cover addr;
 	 *   max_grow    - the room between the stack entry and "end".
 	 */
 	if (stack_entry == next_entry) {
 		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
 		KASSERT(addr < stack_entry->start, ("foo"));
 		end = (prev_entry != &map->header) ? prev_entry->end :
 		    stack_entry->start - stack_entry->avail_ssize;
 		grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
 		max_grow = stack_entry->start - end;
 	} else {
 		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
 		KASSERT(addr >= stack_entry->end, ("foo"));
 		end = (next_entry != &map->header) ? next_entry->start :
 		    stack_entry->end + stack_entry->avail_ssize;
 		grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
 		max_grow = end - stack_entry->end;
 	}
 
 	/* The entry's remaining growth budget is too small. */
 	if (grow_amount > stack_entry->avail_ssize) {
 		vm_map_unlock_read(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * If there is no longer enough space between the entries nogo, and
 	 * adjust the available space.  Note: this  should only happen if the
 	 * user has mapped into the stack area after the stack was created,
 	 * and is probably an error.
 	 *
 	 * This also effectively destroys any guard page the user might have
 	 * intended by limiting the stack size.
 	 */
 	if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) {
 		/* avail_ssize is written below, so a write lock is needed. */
 		if (vm_map_lock_upgrade(map))
 			goto Retry;
 
 		stack_entry->avail_ssize = max_grow;
 
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * The fault is treated as being on the main process stack when
 	 * addr is at or above vm_maxsaddr.
 	 */
 	is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0;
 
 	/*
 	 * If this is the main process stack, see if we're over the stack
 	 * limit.
 	 */
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		vm_map_unlock_read(map);
 		return (KERN_NO_SPACE);
 	}
 #ifdef RACCT
 	/* Ask racct to account the larger stack; a non-zero result is a refusal. */
 	PROC_LOCK(p);
 	if (is_procstack &&
 	    racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) {
 		PROC_UNLOCK(p);
 		vm_map_unlock_read(map);
 		return (KERN_NO_SPACE);
 	}
 	PROC_UNLOCK(p);
 #endif
 
 	/* Round up the grow amount modulo sgrowsiz */
 	growsize = sgrowsiz;
 	grow_amount = roundup(grow_amount, growsize);
 	/* ... but never beyond the entry's budget or the stack limit. */
 	if (grow_amount > stack_entry->avail_ssize)
 		grow_amount = stack_entry->avail_ssize;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		grow_amount = trunc_page((vm_size_t)stacklim) -
 		    ctob(vm->vm_ssize);
 	}
 #ifdef notyet
 	PROC_LOCK(p);
 	limit = racct_get_available(p, RACCT_STACK);
 	PROC_UNLOCK(p);
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
 		grow_amount = limit - ctob(vm->vm_ssize);
 #endif
 	/*
 	 * With MAP_WIREFUTURE set (and old_mlock clear), the growth must
 	 * also fit within the memlock limit.  From here on failures leave
 	 * through "out" so that the racct counters are put back.
 	 */
 	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
 			vm_map_unlock_read(map);
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 #ifdef RACCT
 		PROC_LOCK(p);
 		if (racct_set(p, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
 			PROC_UNLOCK(p);
 			vm_map_unlock_read(map);
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 		PROC_UNLOCK(p);
 #endif
 	}
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + grow_amount > vmemlim) {
 		vm_map_unlock_read(map);
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 #ifdef RACCT
 	PROC_LOCK(p);
 	if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
 		PROC_UNLOCK(p);
 		vm_map_unlock_read(map);
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 	PROC_UNLOCK(p);
 #endif
 
 	/* All checks passed; a write lock is needed to change the map. */
 	if (vm_map_lock_upgrade(map))
 		goto Retry;
 
 	if (stack_entry == next_entry) {
 		/*
 		 * Growing downward.
 		 *
 		 * A new entry is inserted directly below the existing
 		 * stack entry and takes over the GROWS_DOWN flag and the
 		 * remaining avail_ssize.
 		 */
 		/* Get the preliminary new entry start value */
 		addr = stack_entry->start - grow_amount;
 
 		/*
 		 * If this puts us into the previous entry, cut back our
 		 * growth to the available space. Also, see the note above.
 		 */
 		if (addr < end) {
 			stack_entry->avail_ssize = max_grow;
 			addr = end;
 			if (stack_guard_page)
 				addr += PAGE_SIZE;
 		}
 
 		rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
 		    next_entry->protection, next_entry->max_protection, 0);
 
 		/* Adjust the available stack space by the amount we grew. */
 		if (rv == KERN_SUCCESS) {
 			if (prev_entry != &map->header)
 				vm_map_clip_end(map, prev_entry, addr);
 			new_entry = prev_entry->next;
 			KASSERT(new_entry == stack_entry->prev, ("foo"));
 			KASSERT(new_entry->end == stack_entry->start, ("foo"));
 			KASSERT(new_entry->start == addr, ("foo"));
 			/* Recompute from the entry that was actually created. */
 			grow_amount = new_entry->end - new_entry->start;
 			new_entry->avail_ssize = stack_entry->avail_ssize -
 			    grow_amount;
 			stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
 			new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
 		}
 	} else {
 		/*
 		 * Growing upward.
 		 *
 		 * The existing stack entry is extended in place rather
 		 * than a new entry being inserted.
 		 */
 		addr = stack_entry->end + grow_amount;
 
 		/*
 		 * If this puts us into the next entry, cut back our growth
 		 * to the available space. Also, see the note above.
 		 */
 		if (addr > end) {
 			stack_entry->avail_ssize = end - stack_entry->end;
 			addr = end;
 			if (stack_guard_page)
 				addr -= PAGE_SIZE;
 		}
 
 		grow_amount = addr - stack_entry->end;
 		/*
 		 * Use the entry's credential or, failing that, the backing
 		 * object's, to reserve swap for the extra space.
 		 */
 		cred = stack_entry->cred;
 		if (cred == NULL && stack_entry->object.vm_object != NULL)
 			cred = stack_entry->object.vm_object->cred;
 		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
 			rv = KERN_NO_SPACE;
 		/* Grow the underlying object if applicable. */
 		else if (stack_entry->object.vm_object == NULL ||
 			 vm_object_coalesce(stack_entry->object.vm_object,
 			 stack_entry->offset,
 			 (vm_size_t)(stack_entry->end - stack_entry->start),
 			 (vm_size_t)grow_amount, cred != NULL)) {
 			map->size += (addr - stack_entry->end);
 			/* Update the current entry. */
 			stack_entry->end = addr;
 			stack_entry->avail_ssize -= grow_amount;
 			vm_map_entry_resize_free(map, stack_entry);
 			rv = KERN_SUCCESS;
 
 			if (next_entry != &map->header)
 				vm_map_clip_start(map, next_entry, addr);
 		} else
 			rv = KERN_FAILURE;
 	}
 
 	/* Keep the vmspace's stack size in step for the process stack. */
 	if (rv == KERN_SUCCESS && is_procstack)
 		vm->vm_ssize += btoc(grow_amount);
 
 	vm_map_unlock(map);
 
 	/*
 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
 	 */
 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
 		vm_map_wire(map,
 		    (stack_entry == next_entry) ? addr : addr - grow_amount,
 		    (stack_entry == next_entry) ? stack_entry->start : addr,
 		    (p->p_flag & P_SYSTEM)
 		    ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
 		    : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
 	}
 
 out:
 #ifdef RACCT
 	/*
 	 * On failure, set the racct counters that may have been raised
 	 * above back to the values currently in effect.
 	 */
 	if (rv != KERN_SUCCESS) {
 		PROC_LOCK(p);
 		error = racct_set(p, RACCT_VMEM, map->size);
 		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
 		if (!old_mlock) {
 			error = racct_set(p, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)));
 			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
 		}
 	    	error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
 		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	return (rv);
 }
 
 /*
  * Unshare the VM space of process p for exec: a new, null vmspace
  * created with vmspace_alloc(minuser, maxuser) is installed in place
  * of the current one, and the reference on the old vmspace is dropped.
  * Returns ENOMEM if the allocation fails and 0 otherwise.
  */
 int
 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
 {
 	struct vmspace *fresh, *stale;
 
 	stale = p->p_vmspace;
 	fresh = vmspace_alloc(minuser, maxuser);
 	if (fresh == NULL)
 		return (ENOMEM);
 
 	/* vm_swrss is the one field carried over from the old vmspace. */
 	fresh->vm_swrss = stale->vm_swrss;
 
 	/*
 	 * This code is written like this for prototype purposes.  The
 	 * goal is to avoid running down the vmspace here, but let the
 	 * other processes that are still using the vmspace finally run
 	 * it down.  Even though there is little or no chance of
 	 * blocking here, it is a good idea to keep this form for
 	 * future mods.
 	 */
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = fresh;
 	PROC_VMSPACE_UNLOCK(p);
 
 	/* Activate the new pmap when p is the process of curthread. */
 	if (curthread->td_proc == p)
 		pmap_activate(curthread);
 
 	vmspace_free(stale);
 	return (0);
 }
 
 /*
  * Unshare the VM space of process p to force COW.  This is called by
  * rfork, for the (RFMEM|RFPROC) == 0 case.
  *
  * Nothing is done when the vmspace has a reference count of one.
  * Otherwise a copy is made with vmspace_fork(), swap is reserved for
  * the accumulated fork charge against the process credential, and the
  * copy replaces the process's current vmspace.  Returns ENOMEM if the
  * fork or the swap reservation fails, 0 otherwise.
  */
 int
 vmspace_unshare(struct proc *p)
 {
 	struct vmspace *copy, *shared;
 	vm_ooffset_t fork_charge;
 
 	shared = p->p_vmspace;
 	if (shared->vm_refcnt == 1)
 		return (0);
 
 	fork_charge = 0;
 	copy = vmspace_fork(shared, &fork_charge);
 	if (copy == NULL)
 		return (ENOMEM);
 
 	/* Back out the copy if its swap charge cannot be reserved. */
 	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
 		vmspace_free(copy);
 		return (ENOMEM);
 	}
 
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = copy;
 	PROC_VMSPACE_UNLOCK(p);
 
 	/* Activate the new pmap when p is the process of curthread. */
 	if (curthread->td_proc == p)
 		pmap_activate(curthread);
 
 	vmspace_free(shared);
 	return (0);
 }
 
 /*
  *	vm_map_lookup:
  *
  *	Finds the VM object, offset, and
  *	protection for a given virtual address in the
  *	specified map, assuming a page fault of the
  *	type specified.
  *
  *	Leaves the map in question locked for read; return
  *	values are guaranteed until a vm_map_lookup_done
  *	call is performed.  Note that the map argument
  *	is in/out; the returned map must be used in
  *	the call to vm_map_lookup_done.
  *
  *	A handle (out_entry) is returned for use in
  *	vm_map_lookup_done, to make that fast.
  *
  *	If a lookup is requested with "write protection"
  *	specified, the map may be changed to perform virtual
  *	copying operations, although the data referenced will
  *	remain the same.
  *
  *	Returns KERN_SUCCESS with the map read-locked, or one of
  *	KERN_INVALID_ADDRESS (no entry contains vaddr),
  *	KERN_PROTECTION_FAILURE (the entry does not permit the
  *	access) or KERN_RESOURCE_SHORTAGE (swap could not be
  *	reserved for the copy), in which cases the map has been
  *	unlocked again before returning.
  */
 int
 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
 	      vm_offset_t vaddr,
 	      vm_prot_t fault_typea,
 	      vm_map_entry_t *out_entry,	/* OUT */
 	      vm_object_t *object,		/* OUT */
 	      vm_pindex_t *pindex,		/* OUT */
 	      vm_prot_t *out_prot,		/* OUT */
 	      boolean_t *wired)			/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
 	vm_object_t eobject;
 	vm_size_t size;
 	struct ucred *cred;
 
 	/*
 	 * Restart point: used after descending into a submap and
 	 * whenever vm_map_lock_upgrade() returns non-zero.
 	 */
 RetryLookup:;
 
 	vm_map_lock_read(map);
 
 	/*
 	 * Lookup the faulting address.
 	 */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	}
 
 	entry = *out_entry;
 
 	/*
 	 * Handle submaps.  The caller's map pointer is switched to the
 	 * submap, the outer map is unlocked, and the lookup starts over.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		vm_map_t old_map = map;
 
 		*var_map = map = entry->object.sub_map;
 		vm_map_unlock_read(old_map);
 		goto RetryLookup;
 	}
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 * Only the read/write/execute bits of the fault type take part
 	 * in the check; VM_PROT_COPY is tested separately, below, on the
 	 * unmasked fault_typea.
 	 */
 	prot = entry->protection;
 	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
 	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 	/* Write faults on user-wired copy-on-write entries are refused. */
 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
 	    (entry->eflags & MAP_ENTRY_COW) &&
 	    (fault_type & VM_PROT_WRITE)) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		fault_type = entry->protection;
 	size = entry->end - entry->start;
 	/*
 	 * If the entry was copy-on-write, we either ...
 	 */
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * If we want to write the page, we may as well handle that
 		 * now since we've got the map locked.
 		 *
 		 * If we don't need to write the page, we just demote the
 		 * permissions allowed.
 		 */
 		if ((fault_type & VM_PROT_WRITE) != 0 ||
 		    (fault_typea & VM_PROT_COPY) != 0) {
 			/*
 			 * Make a new object, and place it in the object
 			 * chain.  Note that no new references have appeared
 			 * -- one just moved from the map to the new
 			 * object.
 			 */
 			if (vm_map_lock_upgrade(map))
 				goto RetryLookup;
 
 			if (entry->cred == NULL) {
 				/*
 				 * The debugger owner is charged for
 				 * the memory.
 				 */
 				cred = curthread->td_ucred;
 				crhold(cred);
 				if (!swap_reserve_by_cred(size, cred)) {
 					crfree(cred);
 					vm_map_unlock(map);
 					return (KERN_RESOURCE_SHORTAGE);
 				}
 				entry->cred = cred;
 			}
 			vm_object_shadow(&entry->object.vm_object,
 			    &entry->offset, size);
 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 			eobject = entry->object.vm_object;
 			if (eobject->cred != NULL) {
 				/*
 				 * The object was not shadowed.
 				 * The entry's reservation is therefore
 				 * released and its credential dropped.
 				 */
 				swap_release_by_cred(size, entry->cred);
 				crfree(entry->cred);
 				entry->cred = NULL;
 			} else if (entry->cred != NULL) {
 				/*
 				 * Hand the entry's credential and charge
 				 * over to the object.
 				 */
 				VM_OBJECT_LOCK(eobject);
 				eobject->cred = entry->cred;
 				eobject->charge = size;
 				VM_OBJECT_UNLOCK(eobject);
 				entry->cred = NULL;
 			}
 
 			/* Done modifying the entry; return to a read lock. */
 			vm_map_lock_downgrade(map);
 		} else {
 			/*
 			 * We're attempting to read a copy-on-write page --
 			 * don't allow writes.
 			 */
 			prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * Create an object if necessary.
 	 * This is done for entries that have no object yet, except in
 	 * system maps; it requires a write lock on the map.
 	 */
 	if (entry->object.vm_object == NULL &&
 	    !map->system_map) {
 		if (vm_map_lock_upgrade(map))
 			goto RetryLookup;
 		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
 		    atop(size));
 		entry->offset = 0;
 		/* Move the entry's credential, if any, onto the new object. */
 		if (entry->cred != NULL) {
 			VM_OBJECT_LOCK(entry->object.vm_object);
 			entry->object.vm_object->cred = entry->cred;
 			entry->object.vm_object->charge = size;
 			VM_OBJECT_UNLOCK(entry->object.vm_object);
 			entry->cred = NULL;
 		}
 		vm_map_lock_downgrade(map);
 	}
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_locked:
  *
  *	Lookup the faulting address.  A version of vm_map_lookup that
  *	returns KERN_FAILURE instead of blocking on map lock or memory
  *	allocation.  No map locking is performed here, and any case
  *	that would require modifying the entry (a submap, a write
  *	fault on a needs-copy entry, a missing object) is reported as
  *	KERN_FAILURE rather than handled.
  */
 int
 vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
 		     vm_offset_t vaddr,
 		     vm_prot_t fault_typea,
 		     vm_map_entry_t *out_entry,	/* OUT */
 		     vm_object_t *object,	/* OUT */
 		     vm_pindex_t *pindex,	/* OUT */
 		     vm_prot_t *out_prot,	/* OUT */
 		     boolean_t *wired)		/* OUT */
 {
 	vm_map_t map;
 	vm_map_entry_t ent;
 	vm_prot_t access, granted;
 
 	map = *var_map;
 
 	/* Find the entry containing the faulting address. */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry))
 		return (KERN_INVALID_ADDRESS);
 	ent = *out_entry;
 
 	/* Submaps are not followed by this variant. */
 	if ((ent->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 		return (KERN_FAILURE);
 
 	/*
 	 * Is the requested access permitted by the entry?  Only the
 	 * read, write and execute bits of the fault type are considered.
 	 */
 	granted = ent->protection;
 	access = fault_typea &
 	    (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE);
 	if ((access & granted) != access)
 		return (KERN_PROTECTION_FAILURE);
 
 	/* Write faults on user-wired copy-on-write entries are refused. */
 	if ((ent->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
 	    (ent->eflags & MAP_ENTRY_COW) != 0 &&
 	    (access & VM_PROT_WRITE) != 0)
 		return (KERN_PROTECTION_FAILURE);
 
 	/*
 	 * A wired entry is not pageable, so it must be obtained for
 	 * every access the entry allows.
 	 */
 	*wired = (ent->wired_count != 0);
 	if (*wired)
 		access = ent->protection;
 
 	if ((ent->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
 		/* The copy cannot be made here, so a write must fail. */
 		if ((access & VM_PROT_WRITE) != 0)
 			return (KERN_FAILURE);
 
 		/* Reading a copy-on-write page: withhold write access. */
 		granted &= ~VM_PROT_WRITE;
 	}
 
 	/* Nor can a missing object be created here. */
 	if (ent->object.vm_object == NULL && !map->system_map)
 		return (KERN_FAILURE);
 
 	/* Report the object, the page index within it, and the protection. */
 	*pindex = OFF_TO_IDX((vaddr - ent->start) + ent->offset);
 	*object = ent->object.vm_object;
 	*out_prot = granted;
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_done:
  *
  *	Counterpart of vm_map_lookup: drops the read lock on the map
  *	handed back by that lookup.  The entry handle is accepted for
  *	symmetry with the lookup but is not consulted.
  */
 void
 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
 {
 
 	/* Release the read lock taken on the map. */
 	vm_map_unlock_read(map);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 /*
  * DDB helper: print a one-line summary of "map" followed by a
  * description of each of its entries.  Submaps are printed
  * recursively and backing objects through vm_object_print(); either
  * is skipped when the preceding entry refers to the same submap or
  * object.
  */
 static void
 vm_map_print(vm_map_t map)
 {
 	static char *inheritance_name[4] =
 	    {"share", "copy", "none", "donate_copy"};
 	vm_map_entry_t before, cur;
 	vm_object_t obj;
 
 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
 	    (void *)map, (void *)map->pmap, map->nentries, map->timestamp);
 
 	db_indent += 2;
 	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 		before = cur->prev;
 
 		/* Fields common to every kind of entry. */
 		db_iprintf("map entry %p: start=%p, end=%p\n",
 		    (void *)cur, (void *)cur->start, (void *)cur->end);
 		db_iprintf(" prot=%x/%x/%s", cur->protection,
 		    cur->max_protection,
 		    inheritance_name[(int)(unsigned char)cur->inheritance]);
 		if (cur->wired_count != 0)
 			db_printf(", wired");
 
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
 			/* Submap entry: finish the line, then recurse. */
 			db_printf(", share=%p, offset=0x%jx\n",
 			    (void *)cur->object.sub_map,
 			    (uintmax_t)cur->offset);
 			if (before == &map->header ||
 			    before->object.sub_map != cur->object.sub_map) {
 				db_indent += 2;
 				vm_map_print((vm_map_t)cur->object.sub_map);
 				db_indent -= 2;
 			}
 			continue;
 		}
 
 		/* Ordinary entry: credentials, object and COW state. */
 		obj = cur->object.vm_object;
 		if (cur->cred != NULL)
 			db_printf(", ruid %d", cur->cred->cr_ruid);
 		db_printf(", object=%p, offset=0x%jx",
 		    (void *)obj, (uintmax_t)cur->offset);
 		if (obj != NULL && obj->cred != NULL)
 			db_printf(", obj ruid %d charge %jx",
 			    obj->cred->cr_ruid, (uintmax_t)obj->charge);
 		if ((cur->eflags & MAP_ENTRY_COW) != 0)
 			db_printf(", copy (%s)",
 			    (cur->eflags & MAP_ENTRY_NEEDS_COPY) != 0 ?
 			    "needed" : "done");
 		db_printf("\n");
 
 		if (before == &map->header ||
 		    before->object.vm_object != obj) {
 			db_indent += 2;
 			vm_object_print((db_expr_t)(intptr_t)obj,
 			    1, 0, (char *)0);
 			db_indent -= 2;
 		}
 	}
 	db_indent -= 2;
 }
 
 /*
  * DDB "show map <addr>": print the vm_map at the given address, or a
  * usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(map, map)
 {
 
 	if (have_addr) {
 		vm_map_print((vm_map_t)addr);
 		return;
 	}
 	db_printf("usage: show map <addr>\n");
 }
 
 DB_SHOW_COMMAND(procvm, procvm)
 {
 	struct proc *p;
 
 	if (have_addr) {
 		p = (struct proc *) addr;
 	} else {
 		p = curproc;
 	}
 
 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
 	    (void *)vmspace_pmap(p->p_vmspace));
 
 	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
 }
 
 #endif /* DDB */
Index: user/attilio/vmc-playground/sys/vm/vm_object.c
===================================================================
--- user/attilio/vmc-playground/sys/vm/vm_object.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/vm_object.c	(revision 247224)
@@ -1,2399 +1,2400 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory object module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 static int old_msync;
 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
     "Use old (insecure) msync behavior");
 
 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
 		    int pagerflags, int flags, boolean_t *clearobjflags,
 		    boolean_t *eio);
 static boolean_t vm_object_page_remove_write(vm_page_t p, int flags,
 		    boolean_t *clearobjflags);
 static void	vm_object_qcollapse(vm_object_t object);
 static void	vm_object_vndeallocate(vm_object_t object);
 
 /*
  *	Virtual memory objects maintain the actual data
  *	associated with allocated virtual memory.  A given
  *	page of memory exists within exactly one object.
  *
  *	An object is only deallocated when all "references"
  *	are given up.  Only one "reference" to a given
  *	region of an object should be writeable.
  *
  *	Associated with each object is a list of all resident
  *	memory pages belonging to that object; this list is
  *	maintained by the "vm_page" module, and locked by the object's
  *	lock.
  *
  *	Each object also records a "pager" routine which is
  *	used to retrieve (and store) pages to the proper backing
  *	storage.  In addition, objects may be backed by other
  *	objects from which they were virtual-copied.
  *
  *	The only items within the object structure which are
  *	modified after time of creation are:
  *		reference count		locked by object's lock
  *		pager routine		locked by object's lock
  *
  */
 
 struct object_q vm_object_list;
 struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 struct vm_object kernel_object_store;
 struct vm_object kmem_object_store;
 
 static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0,
     "VM object stats");
 
 static long object_collapses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
     &object_collapses, 0, "VM object collapses");
 
 static long object_bypasses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
     &object_bypasses, 0, "VM object bypasses");
 
 static uma_zone_t obj_zone;
 
 static int vm_object_zinit(void *mem, int size, int flags);
 
 #ifdef INVARIANTS
 static void vm_object_zdtor(void *mem, int size, void *arg);
 
 static void
 vm_object_zdtor(void *mem, int size, void *arg)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages",
 	    object));
 #if VM_NRESERVLEVEL > 0
 	KASSERT(LIST_EMPTY(&object->rvq),
 	    ("object %p has reservations",
 	    object));
 #endif
 	KASSERT(vm_object_cache_is_empty(object),
 	    ("object %p has cached pages",
 	    object));
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
 	KASSERT(object->resident_page_count == 0,
 	    ("object %p resident_page_count = %d",
 	    object, object->resident_page_count));
 	KASSERT(object->shadow_count == 0,
 	    ("object %p shadow_count = %d",
 	    object, object->shadow_count));
 }
 #endif
 
 static int
 vm_object_zinit(void *mem, int size, int flags)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
-	bzero(&object->mtx, sizeof(object->mtx));
-	VM_OBJECT_LOCK_INIT(object, "standard object");
 
 	/* These are true for any object that has been freed */
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
+
+	/* The object's mutex is not set up here; _vm_object_allocate() initializes it. */
 	return (0);
 }
 
-void
-_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
+static void
+_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
+    const char *mtxname)
 {
 
+	bzero(&object->mtx, sizeof(object->mtx));
+	mtx_init(&object->mtx, "vm object", mtxname, MTX_DEF | MTX_DUPOK);
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
 
 	object->rtree.rt_root = 0;
 	object->type = type;
 	switch (type) {
 	case OBJT_DEAD:
 		panic("_vm_object_allocate: can't create OBJT_DEAD");
 	case OBJT_DEFAULT:
 	case OBJT_SWAP:
 		object->flags = OBJ_ONEMAPPING;
 		break;
 	case OBJT_DEVICE:
 	case OBJT_SG:
 		object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED;
 		break;
 	case OBJT_MGTDEVICE:
 		object->flags = OBJ_FICTITIOUS;
 		break;
 	case OBJT_PHYS:
 		object->flags = OBJ_UNMANAGED;
 		break;
 	case OBJT_VNODE:
 		object->flags = 0;
 		break;
 	default:
 		panic("_vm_object_allocate: type %d is undefined", type);
 	}
 	object->size = size;
 	object->generation = 1;
 	object->ref_count = 1;
 	object->memattr = VM_MEMATTR_DEFAULT;
 	object->cred = NULL;
 	object->charge = 0;
 	object->pg_color = 0;
 	object->handle = NULL;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
 	object->cache.rt_root = 0;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 }
 
 /*
  *	vm_object_init:
  *
  *	Initialize the VM objects module.
  */
 void
 vm_object_init(void)
 {
 	TAILQ_INIT(&vm_object_list);
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
-	VM_OBJECT_LOCK_INIT(kernel_object, "kernel object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
-	    kernel_object);
+	    kernel_object, "kernel object");
 #if VM_NRESERVLEVEL > 0
 	kernel_object->flags |= OBJ_COLORED;
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
-	VM_OBJECT_LOCK_INIT(kmem_object, "kmem object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
-	    kmem_object);
+	    kmem_object, "kmem object");
 #if VM_NRESERVLEVEL > 0
 	kmem_object->flags |= OBJ_COLORED;
 	kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	/*
 	 * The lock portion of struct vm_object must be type stable due
 	 * to vm_pageout_fallback_object_lock locking a vm object
 	 * without holding any references to it.
 	 */
 	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
 #ifdef INVARIANTS
 	    vm_object_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
 }
 
 void
 vm_object_clear_flag(vm_object_t object, u_short bits)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->flags &= ~bits;
 }
 
 /*
  *	Sets the default memory attribute for the specified object.  Pages
  *	that are allocated to this object are by default assigned this memory
  *	attribute.
  *
  *	Presently, this function must be called before any pages are allocated
  *	to the object.  In the future, this requirement may be relaxed for
  *	"default" and "swap" objects.
  */
 int
 vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	switch (object->type) {
 	case OBJT_DEFAULT:
 	case OBJT_DEVICE:
 	case OBJT_MGTDEVICE:
 	case OBJT_PHYS:
 	case OBJT_SG:
 	case OBJT_SWAP:
 	case OBJT_VNODE:
 		if (!TAILQ_EMPTY(&object->memq))
 			return (KERN_FAILURE);
 		break;
 	case OBJT_DEAD:
 		return (KERN_INVALID_ARGUMENT);
 	default:
 		panic("vm_object_set_memattr: object %p is of undefined type",
 		    object);
 	}
 	object->memattr = memattr;
 	return (KERN_SUCCESS);
 }
 
 void
 vm_object_pip_add(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress += i;
 }
 
 void
 vm_object_pip_subtract(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress -= i;
 }
 
 void
 vm_object_pip_wakeup(vm_object_t object)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress--;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wakeupn(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (i)
 		object->paging_in_progress -= i;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	while (object->paging_in_progress) {
 		object->flags |= OBJ_PIPWNT;
 		msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
 	}
 }
 
 /*
  *	vm_object_allocate:
  *
  *	Returns a new object with the given size.
  */
 vm_object_t
 vm_object_allocate(objtype_t type, vm_pindex_t size)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
-	_vm_object_allocate(type, size, object);
+	_vm_object_allocate(type, size, object, NULL);
 	return (object);
 }
 
 
 /*
  *	vm_object_reference:
  *
  *	Gets another reference to the given object.  Note: OBJ_DEAD
  *	objects can be referenced during final cleaning.
  */
 void
 vm_object_reference(vm_object_t object)
 {
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	vm_object_reference_locked(object);
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_reference_locked:
  *
  *	Gets another reference to the given object.
  *
  *	The object must be locked.
  */
 void
 vm_object_reference_locked(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
 		vp = object->handle;
 		vref(vp);
 	}
 }
 
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
 static void
 vm_object_vndeallocate(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_VNODE,
 	    ("vm_object_vndeallocate: not a vnode object"));
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 #ifdef INVARIANTS
 	if (object->ref_count == 0) {
 		vprint("vm_object_vndeallocate", vp);
 		panic("vm_object_vndeallocate: bad object reference count");
 	}
 #endif
 
 	if (object->ref_count > 1) {
 		object->ref_count--;
 		VM_OBJECT_UNLOCK(object);
 		/* vrele may need the vnode lock. */
 		vrele(vp);
 	} else {
 		vhold(vp);
 		VM_OBJECT_UNLOCK(object);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vdrop(vp);
 		VM_OBJECT_LOCK(object);
 		object->ref_count--;
 		if (object->type == OBJT_DEAD) {
 			VM_OBJECT_UNLOCK(object);
 			VOP_UNLOCK(vp, 0);
 		} else {
 			if (object->ref_count == 0)
 				VOP_UNSET_TEXT(vp);
 			VM_OBJECT_UNLOCK(object);
 			vput(vp);
 		}
 	}
 }
 
 /*
  *	vm_object_deallocate:
  *
  *	Release a reference to the specified object,
  *	gained either through a vm_object_allocate
  *	or a vm_object_reference call.  When all references
  *	are gone, storage associated with this object
  *	may be relinquished.
  *
  *	No object may be locked.
  */
 void
 vm_object_deallocate(vm_object_t object)
 {
 	vm_object_t temp;
 
 	while (object != NULL) {
 		VM_OBJECT_LOCK(object);
 		if (object->type == OBJT_VNODE) {
 			vm_object_vndeallocate(object);
 			return;
 		}
 
 		KASSERT(object->ref_count != 0,
 			("vm_object_deallocate: object deallocated too many times: %d", object->type));
 
 		/*
 		 * If the reference count goes to 0 we start calling
 		 * vm_object_terminate() on the object chain.
 		 * A ref count of 1 may be a special case depending on the
 		 * shadow count being 0 or 1.
 		 */
 		object->ref_count--;
 		if (object->ref_count > 1) {
 			VM_OBJECT_UNLOCK(object);
 			return;
 		} else if (object->ref_count == 1) {
 			if (object->shadow_count == 0 &&
 			    object->handle == NULL &&
 			    (object->type == OBJT_DEFAULT ||
 			     object->type == OBJT_SWAP)) {
 				vm_object_set_flag(object, OBJ_ONEMAPPING);
 			} else if ((object->shadow_count == 1) &&
 			    (object->handle == NULL) &&
 			    (object->type == OBJT_DEFAULT ||
 			     object->type == OBJT_SWAP)) {
 				vm_object_t robject;
 
 				robject = LIST_FIRST(&object->shadow_head);
 				KASSERT(robject != NULL,
 				    ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
 					 object->ref_count,
 					 object->shadow_count));
 				if (!VM_OBJECT_TRYLOCK(robject)) {
 					/*
 					 * Avoid a potential deadlock.
 					 */
 					object->ref_count++;
 					VM_OBJECT_UNLOCK(object);
 					/*
 					 * More likely than not the thread
 					 * holding robject's lock has lower
 					 * priority than the current thread.
 					 * Let the lower priority thread run.
 					 */
 					pause("vmo_de", 1);
 					continue;
 				}
 				/*
 				 * Collapse object into its shadow unless its
 				 * shadow is dead.  In that case, object will
 				 * be deallocated by the thread that is
 				 * deallocating its shadow.
 				 */
 				if ((robject->flags & OBJ_DEAD) == 0 &&
 				    (robject->handle == NULL) &&
 				    (robject->type == OBJT_DEFAULT ||
 				     robject->type == OBJT_SWAP)) {
 
 					robject->ref_count++;
 retry:
 					if (robject->paging_in_progress) {
 						VM_OBJECT_UNLOCK(object);
 						vm_object_pip_wait(robject,
 						    "objde1");
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_LOCK(object);
 							goto retry;
 						}
 					} else if (object->paging_in_progress) {
 						VM_OBJECT_UNLOCK(robject);
 						object->flags |= OBJ_PIPWNT;
 						msleep(object,
 						    VM_OBJECT_MTX(object),
 						    PDROP | PVM, "objde2", 0);
 						VM_OBJECT_LOCK(robject);
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_LOCK(object);
 							goto retry;
 						}
 					} else
 						VM_OBJECT_UNLOCK(object);
 
 					if (robject->ref_count == 1) {
 						robject->ref_count--;
 						object = robject;
 						goto doterm;
 					}
 					object = robject;
 					vm_object_collapse(object);
 					VM_OBJECT_UNLOCK(object);
 					continue;
 				}
 				VM_OBJECT_UNLOCK(robject);
 			}
 			VM_OBJECT_UNLOCK(object);
 			return;
 		}
 doterm:
 		temp = object->backing_object;
 		if (temp != NULL) {
 			VM_OBJECT_LOCK(temp);
 			LIST_REMOVE(object, shadow_list);
 			temp->shadow_count--;
 			VM_OBJECT_UNLOCK(temp);
 			object->backing_object = NULL;
 		}
 		/*
 		 * Don't double-terminate, we could be in a termination
 		 * recursion due to the terminate having to sync data
 		 * to disk.
 		 */
 		if ((object->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(object);
 		else
 			VM_OBJECT_UNLOCK(object);
 		object = temp;
 	}
 }
 
 /*
  *	vm_object_destroy removes the object from the global object list
  *      and frees the space for the object.
  */
 void
 vm_object_destroy(vm_object_t object)
 {
 
 	/*
 	 * Remove the object from the global object list.
 	 */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_REMOVE(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 
 	/*
 	 * Release the allocation charge.
 	 */
 	if (object->cred != NULL) {
 		KASSERT(object->type == OBJT_DEFAULT ||
 		    object->type == OBJT_SWAP,
 		    ("vm_object_terminate: non-swap obj %p has cred",
 		     object));
 		swap_release_by_cred(object->charge, object->cred);
 		object->charge = 0;
 		crfree(object->cred);
 		object->cred = NULL;
 	}
 
 	/*
 	 * Free the space for the object.
 	 */
 	uma_zfree(obj_zone, object);
 }
 
 /*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
  *	The object must be locked.
  *	This routine may block.
  */
 void
 vm_object_terminate(vm_object_t object)
 {
 	vm_page_t p, p_next;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/*
 	 * Make sure no one uses us.
 	 */
 	vm_object_set_flag(object, OBJ_DEAD);
 
 	/*
 	 * wait for the pageout daemon to be done with the object
 	 */
 	vm_object_pip_wait(object, "objtrm");
 
 	KASSERT(!object->paging_in_progress,
 		("vm_object_terminate: pageout in progress"));
 
 	/*
 	 * Clean and free the pages, as appropriate. All references to the
 	 * object are gone, so we don't need to lock it.
 	 */
 	if (object->type == OBJT_VNODE) {
 		struct vnode *vp = (struct vnode *)object->handle;
 
 		/*
 		 * Clean pages and flush buffers.
 		 */
 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_UNLOCK(object);
 
 		vinvalbuf(vp, V_SAVE, 0, 0);
 
 		VM_OBJECT_LOCK(object);
 	}
 
 	KASSERT(object->ref_count == 0, 
 		("vm_object_terminate: object with references, ref_count=%d",
 		object->ref_count));
 
 	/*
 	 * Free any remaining pageable pages.  This also removes them from the
 	 * paging queues.  However, don't free wired pages, just remove them
 	 * from the object.  Rather than incrementally removing each page from
 	 * the object, the page and object are reset to any empty state. 
 	 */
 	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
 		KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
 		    ("vm_object_terminate: freeing busy page %p", p));
 		vm_page_lock(p);
 		/*
 		 * Optimize the page's removal from the object by resetting
 		 * its "object" field.  Specifically, if the page is not
 		 * wired, then the effect of this assignment is that
 		 * vm_page_free()'s call to vm_page_remove() will return
 		 * immediately without modifying the page or the object.
 		 */ 
 		p->object = NULL;
 		if (p->wire_count == 0) {
 			vm_page_free(p);
 			PCPU_INC(cnt.v_pfree);
 		}
 		vm_page_unlock(p);
 	}
 	vm_radix_reclaim_allnodes(&object->rtree);
 	/*
 	 * If the object contained any pages, then reset it to an empty state.
 	 * None of the object's fields, including "resident_page_count", were
 	 * modified by the preceding loop.
 	 */
 	if (object->resident_page_count != 0) {
 		TAILQ_INIT(&object->memq);
 		object->resident_page_count = 0;
 		if (object->type == OBJT_VNODE)
 			vdrop(object->handle);
 	}
 
 #if VM_NRESERVLEVEL > 0
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
 	if (!vm_object_cache_is_empty(object))
 		vm_page_cache_free(object, 0, 0);
 
 	/*
 	 * Let the pager know object is dead.
 	 */
 	vm_pager_deallocate(object);
 	VM_OBJECT_UNLOCK(object);
 
 	vm_object_destroy(object);
 }
 
 /*
  * Make the page read-only so that we can clear the object flags.  However, if
  * this is a nosync mmap then the object is likely to stay dirty so do not
  * mess with the page and do not clear the object flags.  Returns TRUE if the
  * page should be flushed, and FALSE otherwise.
  */
 static boolean_t
 vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags)
 {
 
 	/*
 	 * If we have been asked to skip nosync pages and this is a
 	 * nosync page, skip it.  Note that the object flags were not
 	 * cleared in this case so we do not have to set them.
 	 */
 	if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) {
 		*clearobjflags = FALSE;
 		return (FALSE);
 	} else {
 		pmap_remove_write(p);
 		return (p->dirty != 0);
 	}
 }
 
 /*
  *	vm_object_page_clean
  *
  *	Clean all dirty pages in the specified range of object.  Leaves page 
  * 	on whatever queue it is currently on.   If NOSYNC is set then do not
  *	write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
  *	leaving the object dirty.
  *
  *	When stuffing pages asynchronously, allow clustering.  XXX we need a
  *	synchronous clustering mode implementation.
  *
  *	Odd semantics: if start == end, we clean everything.
  *
  *	The object must be locked.
  *
  *	Returns FALSE if some page from the range was not written, as
  *	reported by the pager, and TRUE otherwise.
  */
 boolean_t
 vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end,
     int flags)
 {
 	vm_page_t np, p;
 	vm_pindex_t pi, tend, tstart;
 	int curgeneration, n, pagerflags;
 	boolean_t clearobjflags, eio, res;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_VNODE, ("Not a vnode object"));
 	if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 ||
 	    object->resident_page_count == 0)
 		return (TRUE);
 
 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ?
 	    VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
 	pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0;
 
 	tstart = OFF_TO_IDX(start);
 	tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK);
 	clearobjflags = tstart == 0 && tend >= object->size;
 	res = TRUE;
 
 rescan:
 	curgeneration = object->generation;
 
 	for (p = vm_page_find_least(object, tstart); p != NULL; p = np) {
 		pi = p->pindex;
 		if (pi >= tend)
 			break;
 		np = TAILQ_NEXT(p, listq);
 		if (p->valid == 0)
 			continue;
 		if (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
 			if (object->generation != curgeneration) {
 				if ((flags & OBJPC_SYNC) != 0)
 					goto rescan;
 				else
 					clearobjflags = FALSE;
 			}
 			np = vm_page_find_least(object, pi);
 			continue;
 		}
 		if (!vm_object_page_remove_write(p, flags, &clearobjflags))
 			continue;
 
 		n = vm_object_page_collect_flush(object, p, pagerflags,
 		    flags, &clearobjflags, &eio);
 		if (eio) {
 			res = FALSE;
 			clearobjflags = FALSE;
 		}
 		if (object->generation != curgeneration) {
 			if ((flags & OBJPC_SYNC) != 0)
 				goto rescan;
 			else
 				clearobjflags = FALSE;
 		}
 
 		/*
 		 * If the VOP_PUTPAGES() did a truncated write, so
 		 * that even the first page of the run is not fully
 		 * written, vm_pageout_flush() returns 0 as the run
 		 * length.  Since the condition that caused truncated
 		 * write may be permanent, e.g. exhausted free space,
 		 * accepting n == 0 would cause an infinite loop.
 		 *
 		 * Forwarding the iterator leaves the unwritten page
 		 * behind, but there is not much we can do there if
 		 * filesystem refuses to write it.
 		 */
 		if (n == 0) {
 			n = 1;
 			clearobjflags = FALSE;
 		}
 		np = vm_page_find_least(object, pi + n);
 	}
 #if 0
 	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0);
 #endif
 
 	if (clearobjflags)
 		vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
 	return (res);
 }
 
 static int
 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags,
     int flags, boolean_t *clearobjflags, boolean_t *eio)
 {
 	vm_page_t ma[vm_pageout_page_count], p_first, tp;
 	int count, i, mreq, runlen;
 
 	vm_page_lock_assert(p, MA_NOTOWNED);
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	count = 1;
 	mreq = 0;
 
 	for (tp = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_next(tp);
 		if (tp == NULL || tp->busy != 0 || (tp->oflags & VPO_BUSY) != 0)
 			break;
 		if (!vm_object_page_remove_write(tp, flags, clearobjflags))
 			break;
 	}
 
 	for (p_first = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_prev(p_first);
 		if (tp == NULL || tp->busy != 0 || (tp->oflags & VPO_BUSY) != 0)
 			break;
 		if (!vm_object_page_remove_write(tp, flags, clearobjflags))
 			break;
 		p_first = tp;
 		mreq++;
 	}
 
 	for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++)
 		ma[i] = tp;
 
 	vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio);
 	return (runlen);
 }
 
 /*
  * Note that there is absolutely no sense in writing out
  * anonymous objects, so we track down the vnode object
  * to write out.
  * We invalidate (remove) all pages from the address space
  * for semantic correctness.
  *
  * If the backing object is a device object with unmanaged pages, then any
  * mappings to the specified range of pages must be removed before this
  * function is called.
  *
  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
  * may start out with a NULL object.
  */
 boolean_t
 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
     boolean_t syncio, boolean_t invalidate)
 {
 	vm_object_t backing_object;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, flags, fsync_after;
 	boolean_t res;
 
 	if (object == NULL)
 		return (TRUE);
 	res = TRUE;
 	error = 0;
 	VM_OBJECT_LOCK(object);
 	while ((backing_object = object->backing_object) != NULL) {
 		VM_OBJECT_LOCK(backing_object);
 		offset += object->backing_object_offset;
 		VM_OBJECT_UNLOCK(object);
 		object = backing_object;
 		if (object->size < OFF_TO_IDX(offset + size))
 			size = IDX_TO_OFF(object->size) - offset;
 	}
 	/*
 	 * Flush pages if writing is allowed, invalidate them
 	 * if invalidation requested.  Pages undergoing I/O
 	 * will be ignored by vm_object_page_remove().
 	 *
 	 * We cannot lock the vnode and then wait for paging
 	 * to complete without deadlocking against vm_fault.
 	 * Instead we simply call vm_object_page_remove() and
 	 * allow it to block internally on a page-by-page
 	 * basis when it encounters pages undergoing async
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
 	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vp = object->handle;
 		VM_OBJECT_UNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (syncio && !invalidate && offset == 0 &&
 		    OFF_TO_IDX(size) == object->size) {
 			/*
 			 * If syncing the whole mapping of the file,
 			 * it is faster to schedule all the writes in
 			 * async mode, also allowing the clustering,
 			 * and then wait for i/o to complete.
 			 */
 			flags = 0;
 			fsync_after = TRUE;
 		} else {
 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
 			flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0;
 			fsync_after = FALSE;
 		}
 		VM_OBJECT_LOCK(object);
 		res = vm_object_page_clean(object, offset, offset + size,
 		    flags);
 		VM_OBJECT_UNLOCK(object);
 		if (fsync_after)
 			error = VOP_FSYNC(vp, MNT_WAIT, curthread);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 		if (error != 0)
 			res = FALSE;
 		VM_OBJECT_LOCK(object);
 	}
 	if ((object->type == OBJT_VNODE ||
 	     object->type == OBJT_DEVICE) && invalidate) {
 		if (object->type == OBJT_DEVICE)
 			/*
 			 * The option OBJPR_NOTMAPPED must be passed here
 			 * because vm_object_page_remove() cannot remove
 			 * unmanaged mappings.
 			 */
 			flags = OBJPR_NOTMAPPED;
 		else if (old_msync)
 			flags = 0;
 		else
 			flags = OBJPR_CLEANONLY;
 		vm_object_page_remove(object, OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK), flags);
 	}
 	VM_OBJECT_UNLOCK(object);
 	return (res);
 }
 
 /*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
  *
  *	MADV_WILLNEED	(any object)
  *
  *	    Activate the specified pages if they are resident.
  *
  *	MADV_DONTNEED	(any object)
  *
  *	    Deactivate the specified pages if they are resident.
  *
  *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects,
  *			 OBJ_ONEMAPPING only)
  *
  *	    Deactivate and clean the specified pages if they are
  *	    resident.  This permits the process to reuse the pages
  *	    without faulting or the kernel to reclaim the pages
  *	    without I/O.
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
     int advise)
 {
 	vm_pindex_t tpindex;
 	vm_object_t backing_object, tobject;
 	vm_page_t m;
 
 	/* A NULL object has nothing to advise on. */
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	/*
 	 * Locate and adjust resident pages
 	 */
 	for (; pindex < end; pindex += 1) {
 		/*
 		 * Each index starts its search at the top-level object;
 		 * "relookup" is also the restart point after sleeping on a
 		 * busy page.
 		 */
 relookup:
 		tobject = object;
 		tpindex = pindex;
 shadowlookup:
 		/*
 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
 		 * and those pages must be OBJ_ONEMAPPING.
 		 */
 		if (advise == MADV_FREE) {
 			if ((tobject->type != OBJT_DEFAULT &&
 			     tobject->type != OBJT_SWAP) ||
 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
 				goto unlock_tobject;
 			}
 		} else if ((tobject->flags & OBJ_UNMANAGED) != 0)
 			goto unlock_tobject;
 		m = vm_page_lookup(tobject, tpindex);
 		if (m == NULL && advise == MADV_WILLNEED) {
 			/*
 			 * If the page is cached, reactivate it.
 			 */
 			m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
 			    VM_ALLOC_NOBUSY);
 		}
 		if (m == NULL) {
 			/*
 			 * There may be swap even if there is no backing page
 			 */
 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 				swap_pager_freespace(tobject, tpindex, 1);
 			/*
 			 * next object
 			 *
 			 * The backing object is locked before the current
 			 * one is released.  The top-level object's lock is
 			 * never dropped here; it stays held for the whole
 			 * walk.
 			 */
 			backing_object = tobject->backing_object;
 			if (backing_object == NULL)
 				goto unlock_tobject;
 			VM_OBJECT_LOCK(backing_object);
 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
 			if (tobject != object)
 				VM_OBJECT_UNLOCK(tobject);
 			tobject = backing_object;
 			goto shadowlookup;
 		} else if (m->valid != VM_PAGE_BITS_ALL)
 			goto unlock_tobject;
 		/*
 		 * If the page is not in a normal state, skip it.
 		 */
 		vm_page_lock(m);
 		if (m->hold_count != 0 || m->wire_count != 0) {
 			vm_page_unlock(m);
 			goto unlock_tobject;
 		}
 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_madvise: page %p is fictitious", m));
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("vm_object_madvise: page %p is not managed", m));
 		if ((m->oflags & VPO_BUSY) || m->busy) {
 			if (advise == MADV_WILLNEED) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it. 
 				 */
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			}
 			vm_page_unlock(m);
 			/*
 			 * Sleep on tobject's lock with PDROP, so that lock
 			 * is not held again after the wakeup.  If the page
 			 * was found in a backing object, the top-level
 			 * object's lock is released separately first.  In
 			 * both cases only the top-level lock is retaken
 			 * before the lookup is restarted.
 			 */
 			if (object != tobject)
 				VM_OBJECT_UNLOCK(object);
 			m->oflags |= VPO_WANTED;
 			msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo",
 			    0);
 			VM_OBJECT_LOCK(object);
   			goto relookup;
 		}
 		if (advise == MADV_WILLNEED) {
 			vm_page_activate(m);
 		} else if (advise == MADV_DONTNEED) {
 			vm_page_dontneed(m);
 		} else if (advise == MADV_FREE) {
 			/*
 			 * Mark the page clean.  This will allow the page
 			 * to be freed up by the system.  However, such pages
 			 * are often reused quickly by malloc()/free()
 			 * so we do not do anything that would cause
 			 * a page fault if we can help it.
 			 *
 			 * Specifically, we do not try to actually free
 			 * the page now nor do we try to put it in the
 			 * cache (which would cause a page fault on reuse).
 			 *
 			 * But we do make the page as freeable as we
 			 * can without actually taking the step of unmapping
 			 * it.
 			 */
 			pmap_clear_modify(m);
 			m->dirty = 0;
 			m->act_count = 0;
 			vm_page_dontneed(m);
 		}
 		vm_page_unlock(m);
 		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 			swap_pager_freespace(tobject, tpindex, 1);
 		/*
 		 * Release the lock of whichever backing object the walk
 		 * stopped at; the top-level object is unlocked only after
 		 * the loop.
 		 */
 unlock_tobject:
 		if (tobject != object)
 			VM_OBJECT_UNLOCK(tobject);
 	}	
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_shadow:
  *
  *	Create a new object which is backed by the
  *	specified existing object range.  The source
  *	object reference is deallocated.
  *
  *	The new object and offset into that object
  *	are returned in the source parameters.
  */
 void
 vm_object_shadow(
 	vm_object_t *object,	/* IN/OUT */
 	vm_ooffset_t *offset,	/* IN/OUT */
 	vm_size_t length)
 {
 	vm_object_t backing, shadow;
 	int unshared;
 
 	backing = *object;
 
 	/*
 	 * An anonymous (default or swap) object with a single reference
 	 * and no handle is not shared, so no shadow is created for it and
 	 * the caller's object and offset are left untouched.
 	 */
 	if (backing != NULL) {
 		VM_OBJECT_LOCK(backing);
 		unshared = backing->ref_count == 1 &&
 		    backing->handle == NULL &&
 		    (backing->type == OBJT_DEFAULT ||
 		     backing->type == OBJT_SWAP);
 		VM_OBJECT_UNLOCK(backing);
 		if (unshared)
 			return;
 	}
 
 	/*
 	 * Create the shadow, sized to cover "length" bytes, and point it at
 	 * the source object at the caller's offset.  The shadow takes over
 	 * the caller's reference to the source, so the source's reference
 	 * count is not adjusted here.
 	 */
 	shadow = vm_object_allocate(OBJT_DEFAULT, atop(length));
 	shadow->backing_object = backing;
 	shadow->backing_object_offset = *offset;
 
 	/*
 	 * Register the shadow on the source's shadow list and, when
 	 * reservations are compiled in, derive the shadow's page color from
 	 * the source so that coloring stays consistent across the pair.
 	 */
 	if (backing != NULL) {
 		VM_OBJECT_LOCK(backing);
 		LIST_INSERT_HEAD(&backing->shadow_head, shadow, shadow_list);
 		backing->shadow_count++;
 #if VM_NRESERVLEVEL > 0
 		shadow->flags |= backing->flags & OBJ_COLORED;
 		shadow->pg_color = (backing->pg_color + OFF_TO_IDX(*offset)) &
 		    ((1 << (VM_NFREEORDER - 1)) - 1);
 #endif
 		VM_OBJECT_UNLOCK(backing);
 	}
 
 	/*
 	 * Hand the new object back; the caller now addresses it from
 	 * offset zero.
 	 */
 	*offset = 0;
 	*object = shadow;
 }
 
 /*
  *	vm_object_split:
  *
  * Split the pages in a map entry into a new object.  This affords
  * easier removal of unused pages, and keeps object inheritance from
  * being a negative impact on memory usage.
  */
 void
 vm_object_split(vm_map_entry_t entry)
 {
 	vm_page_t m, m_next;
 	vm_object_t orig_object, new_object, source;
 	vm_pindex_t idx, offidxstart;
 	vm_size_t size;
 
 	/*
 	 * The entry's object is unlocked below without being locked here
 	 * first, so the caller must pass it in locked.  Every return path
 	 * leaves the object that the entry refers to at that point locked.
 	 */
 	orig_object = entry->object.vm_object;
 	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
 		return;
 	if (orig_object->ref_count <= 1)
 		return;
 	VM_OBJECT_UNLOCK(orig_object);
 
 	/* Page range of orig_object that this map entry covers. */
 	offidxstart = OFF_TO_IDX(entry->offset);
 	size = atop(entry->end - entry->start);
 
 	/*
 	 * If swap_pager_copy() is later called, it will convert new_object
 	 * into a swap object.
 	 */
 	new_object = vm_object_allocate(OBJT_DEFAULT, size);
 
 	/*
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
 	VM_OBJECT_LOCK(new_object);
 	VM_OBJECT_LOCK(orig_object);
 	source = orig_object->backing_object;
 	if (source != NULL) {
 		VM_OBJECT_LOCK(source);
 		if ((source->flags & OBJ_DEAD) != 0) {
 			/*
 			 * The backing object is marked dead: abandon the
 			 * split, discard the new object, and return with
 			 * orig_object locked again.
 			 */
 			VM_OBJECT_UNLOCK(source);
 			VM_OBJECT_UNLOCK(orig_object);
 			VM_OBJECT_UNLOCK(new_object);
 			vm_object_deallocate(new_object);
 			VM_OBJECT_LOCK(orig_object);
 			return;
 		}
 		/* new_object becomes an additional shadow of source. */
 		LIST_INSERT_HEAD(&source->shadow_head,
 				  new_object, shadow_list);
 		source->shadow_count++;
 		vm_object_reference_locked(source);	/* for new_object */
 		vm_object_clear_flag(source, OBJ_ONEMAPPING);
 		VM_OBJECT_UNLOCK(source);
 		new_object->backing_object_offset = 
 			orig_object->backing_object_offset + entry->offset;
 		new_object->backing_object = source;
 	}
 	if (orig_object->cred != NULL) {
 		/*
 		 * Move the charge for the split-off range from orig_object
 		 * to new_object, taking an extra hold on the credential.
 		 */
 		new_object->cred = orig_object->cred;
 		crhold(orig_object->cred);
 		new_object->charge = ptoa(size);
 		KASSERT(orig_object->charge >= ptoa(size),
 		    ("orig_object->charge < 0"));
 		orig_object->charge -= ptoa(size);
 	}
 retry:
 	m = vm_page_find_least(orig_object, offidxstart);
 	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
 	    m = m_next) {
 		/* Fetch the successor first; the rename below moves "m". */
 		m_next = TAILQ_NEXT(m, listq);
 
 		/*
 		 * We must wait for pending I/O to complete before we can
 		 * rename the page.
 		 *
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
 		if ((m->oflags & VPO_BUSY) || m->busy) {
 			/*
 			 * new_object's lock is dropped by hand around the
 			 * sleep; the sleep itself is on orig_object's lock
 			 * (no PDROP).  The scan then restarts from the
 			 * first page in range.
 			 */
 			VM_OBJECT_UNLOCK(new_object);
 			m->oflags |= VPO_WANTED;
 			msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
 			VM_OBJECT_LOCK(new_object);
 			goto retry;
 		}
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * If some of the reservation's allocated pages remain with
 		 * the original object, then transferring the reservation to
 		 * the new object is neither particularly beneficial nor
 		 * particularly harmful as compared to leaving the reservation
 		 * with the original object.  If, however, all of the
 		 * reservation's allocated pages are transferred to the new
 		 * object, then transferring the reservation is typically
 		 * beneficial.  Determining which of these two cases applies
 		 * would be more costly than unconditionally renaming the
 		 * reservation.
 		 */
 		vm_reserv_rename(m, new_object, orig_object, offidxstart);
 #endif
 		vm_page_lock(m);
 		vm_page_rename(m, new_object, idx);
 		vm_page_unlock(m);
 		/* page automatically made dirty by rename and cache handled */
 		/*
 		 * Each moved page is busied here and woken up again in the
 		 * TAILQ_FOREACH near the end of the function.
 		 */
 		vm_page_busy(m);
 	}
 	if (orig_object->type == OBJT_SWAP) {
 		/*
 		 * swap_pager_copy() can sleep, in which case the orig_object's
 		 * and new_object's locks are released and reacquired. 
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 
 		/*
 		 * Transfer any cached pages from orig_object to new_object.
 		 * If swap_pager_copy() found swapped out pages within the
 		 * specified range of orig_object, then it changed
 		 * new_object's type to OBJT_SWAP when it transferred those
 		 * pages to new_object.  Otherwise, new_object's type
 		 * should still be OBJT_DEFAULT and orig_object should not
 		 * contain any cached pages within the specified range.
 		 */
 		if (!vm_object_cache_is_empty(orig_object))
 			vm_page_cache_transfer(orig_object, offidxstart,
 			    new_object);
 	}
 	VM_OBJECT_UNLOCK(orig_object);
 	/* Release the busy state taken on every page that was moved. */
 	TAILQ_FOREACH(m, &new_object->memq, listq)
 		vm_page_wakeup(m);
 	VM_OBJECT_UNLOCK(new_object);
 	/*
 	 * Switch the map entry over to the new object, drop the entry's
 	 * reference on the original, and return with the new object locked.
 	 */
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
 	vm_object_deallocate(orig_object);
 	VM_OBJECT_LOCK(new_object);
 }
 
 /*
  * Operation flags for vm_object_backing_scan():
  *
  *	OBSC_TEST_ALL_SHADOWED	Test whether every resident page of the
  *				backing object that falls within the parent's
  *				range is shadowed by the parent.
  *	OBSC_COLLAPSE_NOWAIT	Move or free backing pages without sleeping;
  *				busy or invalid backing pages are skipped.
  *	OBSC_COLLAPSE_WAIT	Move or free backing pages, sleeping on busy
  *				pages and restarting the scan afterwards.
  */
 #define	OBSC_TEST_ALL_SHADOWED	0x0001
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
 /*
  *	vm_object_backing_scan:
  *
  *	Walk the resident pages of "object"'s backing object and apply the
  *	operation(s) selected by "op" (OBSC_* flags above).
  *
  *	Returns 1, except that with OBSC_TEST_ALL_SHADOWED it returns 0 when
  *	the backing object is not OBJT_DEFAULT or when an in-range backing
  *	page is found that the parent neither holds as a valid page nor has
  *	in its pager.
  *
  *	Both the object and its backing object must be locked (asserted).
  *	With OBSC_COLLAPSE_WAIT both locks may be dropped and retaken while
  *	sleeping on a busy page.
  */
 static int
 vm_object_backing_scan(vm_object_t object, int op)
 {
 	int r = 1;
 	vm_page_t p;
 	vm_object_t backing_object;
 	vm_pindex_t backing_offset_index;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
 
 	backing_object = object->backing_object;
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
 	 * Initial conditions
 	 */
 	if (op & OBSC_TEST_ALL_SHADOWED) {
 		/*
 		 * We do not want to have to test for the existence of cache
 		 * or swap pages in the backing object.  XXX but with the
 		 * new swapper this would be pretty easy to do.
 		 *
 		 * XXX what about anonymous MAP_SHARED memory that hasn't
 		 * been ZFOD faulted yet?  If we do not test for this, the
 		 * shadow test may succeed! XXX
 		 */
 		if (backing_object->type != OBJT_DEFAULT) {
 			return (0);
 		}
 	}
 	if (op & OBSC_COLLAPSE_WAIT) {
 		vm_object_set_flag(backing_object, OBJ_DEAD);
 	}
 
 	/*
 	 * Our scan
 	 */
 	p = TAILQ_FIRST(&backing_object->memq);
 	while (p) {
 		/*
 		 * "next" is captured up front because "p" may be freed,
 		 * removed or renamed below.  "new_pindex" is the index the
 		 * page would have in the parent object.
 		 */
 		vm_page_t next = TAILQ_NEXT(p, listq);
 		vm_pindex_t new_pindex = p->pindex - backing_offset_index;
 
 		if (op & OBSC_TEST_ALL_SHADOWED) {
 			vm_page_t pp;
 
 			/*
 			 * Ignore pages outside the parent object's range
 			 * and outside the parent object's mapping of the 
 			 * backing object.
 			 *
 			 * note that we do not busy the backing object's
 			 * page.
 			 */
 			if (
 			    p->pindex < backing_offset_index ||
 			    new_pindex >= object->size
 			) {
 				p = next;
 				continue;
 			}
 
 			/*
 			 * See if the parent has the page or if the parent's
 			 * object pager has the page.  If the parent has the
 			 * page but the page is not valid, the parent's
 			 * object pager must have the page.
 			 *
 			 * If this fails, the parent does not completely shadow
 			 * the object and we might as well give up now.
 			 */
 
 			pp = vm_page_lookup(object, new_pindex);
 			if (
 			    (pp == NULL || pp->valid == 0) &&
 			    !vm_pager_has_page(object, new_pindex, NULL, NULL)
 			) {
 				r = 0;
 				break;
 			}
 		}
 
 		/*
 		 * Check for busy page
 		 */
 		if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
 			vm_page_t pp;
 
 			if (op & OBSC_COLLAPSE_NOWAIT) {
 				/* Never sleep: skip busy or invalid pages. */
 				if ((p->oflags & VPO_BUSY) ||
 				    !p->valid || 
 				    p->busy) {
 					p = next;
 					continue;
 				}
 			} else if (op & OBSC_COLLAPSE_WAIT) {
 				if ((p->oflags & VPO_BUSY) || p->busy) {
 					/*
 					 * Drop the parent's lock, sleep on
 					 * the backing object's lock with
 					 * PDROP, then retake both locks.
 					 */
 					VM_OBJECT_UNLOCK(object);
 					p->oflags |= VPO_WANTED;
 					msleep(p, VM_OBJECT_MTX(backing_object),
 					    PDROP | PVM, "vmocol", 0);
 					VM_OBJECT_LOCK(object);
 					VM_OBJECT_LOCK(backing_object);
 					/*
 					 * If we slept, anything could have
 					 * happened.  Since the object is
 					 * marked dead, the backing offset
 					 * should not have changed so we
 					 * just restart our scan.
 					 */
 					p = TAILQ_FIRST(&backing_object->memq);
 					continue;
 				}
 			}
 
 			KASSERT(
 			    p->object == backing_object,
 			    ("vm_object_backing_scan: object mismatch")
 			);
 
 			/*
 			 * Destroy any associated swap
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				swap_pager_freespace(
 				    backing_object, 
 				    p->pindex,
 				    1
 				);
 			}
 
 			if (
 			    p->pindex < backing_offset_index ||
 			    new_pindex >= object->size
 			) {
 				/*
 				 * Page is out of the parent object's range, we 
 				 * can simply destroy it. 
 				 */
 				vm_page_lock(p);
 				KASSERT(!pmap_page_is_mapped(p),
 				    ("freeing mapped page %p", p));
 				/*
 				 * A wired page is only removed from the
 				 * object rather than freed.
 				 */
 				if (p->wire_count == 0)
 					vm_page_free(p);
 				else
 					vm_page_remove(p);
 				vm_page_unlock(p);
 				p = next;
 				continue;
 			}
 
 			pp = vm_page_lookup(object, new_pindex);
 			if (
 			    (op & OBSC_COLLAPSE_NOWAIT) != 0 &&
 			    (pp != NULL && pp->valid == 0)
 			) {
 				/*
 				 * The page in the parent is not (yet) valid.
 				 * We don't know anything about the state of
 				 * the original page.  It might be mapped,
 				 * so we must avoid the next if here.
 				 *
 				 * This is due to a race in vm_fault() where
 				 * we must unbusy the original (backing_obj)
 				 * page before we can (re)lock the parent.
 				 * Hence we can get here.
 				 */
 				p = next;
 				continue;
 			}
 			if (
 			    pp != NULL ||
 			    vm_pager_has_page(object, new_pindex, NULL, NULL)
 			) {
 				/*
 				 * page already exists in parent OR swap exists
 				 * for this location in the parent.  Destroy 
 				 * the original page from the backing object.
 				 *
 				 * Leave the parent's page alone
 				 */
 				vm_page_lock(p);
 				KASSERT(!pmap_page_is_mapped(p),
 				    ("freeing mapped page %p", p));
 				if (p->wire_count == 0)
 					vm_page_free(p);
 				else
 					vm_page_remove(p);
 				vm_page_unlock(p);
 				p = next;
 				continue;
 			}
 
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Rename the reservation.
 			 */
 			vm_reserv_rename(p, object, backing_object,
 			    backing_offset_index);
 #endif
 
 			/*
 			 * Page does not exist in parent, rename the
 			 * page from the backing object to the main object. 
 			 *
 			 * If the page was mapped to a process, it can remain 
 			 * mapped through the rename.
 			 */
 			vm_page_lock(p);
 			vm_page_rename(p, object, new_pindex);
 			vm_page_unlock(p);
 			/* page automatically made dirty by rename */
 		}
 		p = next;
 	}
 	return (r);
 }
 
 
 /*
  * this version of collapse allows the operation to occur earlier and
  * when paging_in_progress is true for an object...  This is not a complete
  * operation, but should plug 99.9% of the rest of the leaks.
  */
 static void
 vm_object_qcollapse(vm_object_t object)
 {
 	vm_object_t backing_object;
 
 	backing_object = object->backing_object;
 
 	/* Both the object and its backing object must be locked. */
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED);
 
 	/*
 	 * Only a backing object referenced by this parent alone is
 	 * scanned; the scan runs in its non-sleeping mode.
 	 */
 	if (backing_object->ref_count == 1)
 		vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
 }
 
 /*
  *	vm_object_collapse:
  *
  *	Collapse an object with the object backing it.
  *	Pages in the backing object are moved into the
  *	parent, and the backing object is deallocated.
  */
 void
 vm_object_collapse(vm_object_t object)
 {
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	
 	/*
 	 * Each pass either collapses or bypasses one backing object and then
 	 * retries with the new one, or breaks out when nothing more can be
 	 * done.  "object" stays locked throughout; the backing object is
 	 * locked and unlocked within each pass.
 	 */
 	while (TRUE) {
 		vm_object_t backing_object;
 
 		/*
 		 * Verify that the conditions are right for collapse:
 		 *
 		 * The object exists and the backing object exists.
 		 */
 		if ((backing_object = object->backing_object) == NULL)
 			break;
 
 		/*
 		 * we check the backing object first, because it is most likely
 		 * not collapsible.
 		 */
 		VM_OBJECT_LOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
 		     backing_object->type != OBJT_SWAP) ||
 		    (backing_object->flags & OBJ_DEAD) ||
 		    object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
 		    (object->flags & OBJ_DEAD)) {
 			VM_OBJECT_UNLOCK(backing_object);
 			break;
 		}
 
 		/*
 		 * With paging in progress on either object, only the
 		 * non-sleeping partial collapse is attempted, and the loop
 		 * ends.
 		 */
 		if (
 		    object->paging_in_progress != 0 ||
 		    backing_object->paging_in_progress != 0
 		) {
 			vm_object_qcollapse(object);
 			VM_OBJECT_UNLOCK(backing_object);
 			break;
 		}
 		/*
 		 * We know that we can either collapse the backing object (if
 		 * the parent is the only reference to it) or (perhaps) have
 		 * the parent bypass the object if the parent happens to shadow
 		 * all the resident pages in the entire backing object.
 		 *
 		 * This is ignoring pager-backed pages such as swap pages.
 		 * vm_object_backing_scan fails the shadowing test in this
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.  
 			 */
 			vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
 
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Break any reservations from backing_object.
 			 */
 			if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
 				vm_reserv_break_all(backing_object);
 #endif
 
 			/*
 			 * Move the pager from backing_object to object.
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				/*
 				 * swap_pager_copy() can sleep, in which case
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
 				 * Since swap_pager_copy() is being asked to
 				 * destroy the source, it will change the
 				 * backing_object's type to OBJT_DEFAULT.
 				 */
 				swap_pager_copy(
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
 
 				/*
 				 * Free any cached pages from backing_object.
 				 */
 				if (!vm_object_cache_is_empty(backing_object))
 					vm_page_cache_free(backing_object, 0, 0);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
 			 * Note that the reference to 
 			 * backing_object->backing_object moves from within 
 			 * backing_object to within object.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			if (backing_object->backing_object) {
 				/*
 				 * In the grandparent's shadow list, replace
 				 * backing_object with object.
 				 */
 				VM_OBJECT_LOCK(backing_object->backing_object);
 				LIST_REMOVE(backing_object, shadow_list);
 				LIST_INSERT_HEAD(
 				    &backing_object->backing_object->shadow_head,
 				    object, shadow_list);
 				/*
 				 * The shadow_count has not changed.
 				 */
 				VM_OBJECT_UNLOCK(backing_object->backing_object);
 			}
 			/*
 			 * Offsets accumulate: object's offset is now
 			 * relative to the grandparent.
 			 */
 			object->backing_object = backing_object->backing_object;
 			object->backing_object_offset +=
 			    backing_object->backing_object_offset;
 
 			/*
 			 * Discard backing_object.
 			 *
 			 * Since the backing object has no pages, no pager left,
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
 			KASSERT(backing_object->ref_count == 1, (
 "backing_object %p was somehow re-referenced during collapse!",
 			    backing_object));
 			VM_OBJECT_UNLOCK(backing_object);
 			vm_object_destroy(backing_object);
 
 			object_collapses++;
 		} else {
 			vm_object_t new_backing_object;
 
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
 			 *
 			 * The scan is skipped when every page of the parent
 			 * is resident (resident_page_count == size).
 			 */
 			if (object->resident_page_count != object->size &&
 			    vm_object_backing_scan(object,
 			    OBSC_TEST_ALL_SHADOWED) == 0) {
 				VM_OBJECT_UNLOCK(backing_object);
 				break;
 			}
 
 			/*
 			 * Make the parent shadow the next object in the
 			 * chain.  Deallocating backing_object will not remove
 			 * it, since its reference count is at least 2.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 
 			new_backing_object = backing_object->backing_object;
 			if ((object->backing_object = new_backing_object) != NULL) {
 				/*
 				 * Unlike the collapse case, backing_object
 				 * keeps its own link to the next object, so
 				 * the parent adds a new shadow entry and a
 				 * new reference.
 				 */
 				VM_OBJECT_LOCK(new_backing_object);
 				LIST_INSERT_HEAD(
 				    &new_backing_object->shadow_head,
 				    object,
 				    shadow_list
 				);
 				new_backing_object->shadow_count++;
 				vm_object_reference_locked(new_backing_object);
 				VM_OBJECT_UNLOCK(new_backing_object);
 				object->backing_object_offset +=
 					backing_object->backing_object_offset;
 			}
 
 			/*
 			 * Drop the reference count on backing_object. Since
 			 * its ref_count was at least 2, it will not vanish.
 			 */
 			backing_object->ref_count--;
 			VM_OBJECT_UNLOCK(backing_object);
 			object_bypasses++;
 		}
 
 		/*
 		 * Try again with this object's new backing object.
 		 */
 	}
 }
 
 /*
  *	vm_object_page_remove:
  *
  *	For the given object, either frees or invalidates each of the
  *	specified pages.  In general, a page is freed.  However, if a page is
  *	wired for any reason other than the existence of a managed, wired
  *	mapping, then it may be invalidated but not removed from the object.
  *	Pages are specified by the given range ["start", "end") and the option
  *	OBJPR_CLEANONLY.  As a special case, if "end" is zero, then the range
  *	extends from "start" to the end of the object.  If the option
  *	OBJPR_CLEANONLY is specified, then only the non-dirty pages within the
  *	specified range are affected.  If the option OBJPR_NOTMAPPED is
  *	specified, then the pages within the specified range must have no
  *	mappings.  Otherwise, if this option is not specified, any mappings to
  *	the specified pages are removed before the pages are freed or
  *	invalidated.
  *
  *	In general, this operation should only be performed on objects that
  *	contain managed pages.  There are, however, two exceptions.  First, it
  *	is performed on the kernel and kmem objects by vm_map_entry_delete().
  *	Second, it is used by msync(..., MS_INVALIDATE) to invalidate device-
  *	backed pages.  In both of these cases, the option OBJPR_CLEANONLY must
  *	not be specified and the option OBJPR_NOTMAPPED must be specified.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
     int options)
 {
 	vm_page_t p, next;
 	int wirings;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/*
 	 * For an OBJ_UNMANAGED object the only accepted combination is
 	 * OBJPR_NOTMAPPED set and OBJPR_CLEANONLY clear.
 	 */
 	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
 	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
 	    ("vm_object_page_remove: illegal options for object %p", object));
 	/* With no resident pages, only cached pages remain to be handled. */
 	if (object->resident_page_count == 0)
 		goto skipmemq;
 	/*
 	 * Account for a paging operation in progress for the duration of the
 	 * scan; it is released by vm_object_pip_wakeup() after the loop.
 	 */
 	vm_object_pip_add(object, 1);
 again:
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
 	 * greater than or equal to the parameter "start" or (2) NULL. 
 	 */
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		/* Capture the successor before "p" can be freed. */
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * If the page is wired for any reason besides the existence
 		 * of managed, wired mappings, then it cannot be freed.  For
 		 * example, fictitious pages, which represent device memory,
 		 * are inherently wired and cannot be freed.  They can,
 		 * however, be invalidated if the option OBJPR_CLEANONLY is
 		 * not specified.
 		 *
 		 * "wirings" ends up as zero for an unwired page, and
 		 * otherwise as the count returned by
 		 * pmap_page_wired_mappings().
 		 */
 		vm_page_lock(p);
 		if ((wirings = p->wire_count) != 0 &&
 		    (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
 			if ((options & OBJPR_NOTMAPPED) == 0) {
 				pmap_remove_all(p);
 				/* Account for removal of wired mappings. */
 				if (wirings != 0)
 					p->wire_count -= wirings;
 			}
 			if ((options & OBJPR_CLEANONLY) == 0) {
 				p->valid = 0;
 				vm_page_undirty(p);
 			}
 			vm_page_unlock(p);
 			continue;
 		}
 		/*
 		 * A nonzero return restarts the scan from "start".
 		 * NOTE(review): presumably vm_page_sleep_if_busy() has slept
 		 * and released the page lock in that case; confirm against
 		 * its definition.
 		 */
 		if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
 			goto again;
 		KASSERT((p->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_page_remove: page %p is fictitious", p));
 		if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
 			/*
 			 * Clean-only mode: a valid page that is dirty is
 			 * left in place.  Unless the caller asserted that no
 			 * mappings exist, write access is revoked before the
 			 * dirty field is examined.
 			 */
 			if ((options & OBJPR_NOTMAPPED) == 0)
 				pmap_remove_write(p);
 			if (p->dirty) {
 				vm_page_unlock(p);
 				continue;
 			}
 		}
 		if ((options & OBJPR_NOTMAPPED) == 0) {
 			pmap_remove_all(p);
 			/* Account for removal of wired mappings. */
 			if (wirings != 0) {
 				KASSERT(p->wire_count == wirings,
 				    ("inconsistent wire count %d %d %p",
 				    p->wire_count, wirings, p));
 				p->wire_count = 0;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
 			}
 		}
 		vm_page_free(p);
 		vm_page_unlock(p);
 	}
 	vm_object_pip_wakeup(object);
 skipmemq:
 	/* Finally, release any cached pages of the object in the range. */
 	if (!vm_object_cache_is_empty(object))
 		vm_page_cache_free(object, start, end);
 }
 
 /*
  *	vm_object_page_cache:
  *
  *	For the given object, attempt to move the specified clean
  *	pages to the cache queue.  If a page is wired for any reason,
  *	then it will not be changed.  Pages are specified by the given
  *	range ["start", "end").  As a special case, if "end" is zero,
  *	then the range extends from "start" to the end of the object.
  *	Any mappings to the specified pages are removed before the
  *	pages are moved to the cache queue.
  *
  *	This operation should only be performed on objects that
  *	contain non-fictitious, managed pages.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	struct mtx *held, *wanted;
 	vm_page_t p, next;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
 	    ("vm_object_page_cache: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 
 	/*
 	 * Begin with the first resident page at or beyond "start" (or NULL
 	 * if there is none) and visit pages in list order until the end of
 	 * the range; an "end" of zero means no upper bound.
 	 */
 	held = NULL;
 	p = vm_page_find_least(object, start);
 	while (p != NULL && (end == 0 || p->pindex < end)) {
 		/* Fetch the successor before the page is operated on. */
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * Switch page locks only when this page is covered by a
 		 * different mutex than the one currently held.
 		 */
 		wanted = vm_page_lockptr(p);
 		if (wanted != held) {
 			if (held != NULL)
 				mtx_unlock(held);
 			held = wanted;
 			mtx_lock(held);
 		}
 		vm_page_try_to_cache(p);
 		p = next;
 	}
 
 	/* Drop whichever page lock is still held after the last page. */
 	if (held != NULL)
 		mtx_unlock(held);
 }
 
 /*
  *	Populate the specified range of the object with valid pages.  Returns
  *	TRUE if the range is successfully populated and FALSE otherwise.
  *
  *	Note: This function should be optimized to pass a larger array of
  *	pages to vm_pager_get_pages() before it is applied to a non-
  *	OBJT_DEVICE object.
  *
  *	The object must be locked.
  */
 boolean_t
 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_page_t m, ma[1];
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	for (pindex = start; pindex < end; pindex++) {
 		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL |
 		    VM_ALLOC_RETRY);
 		if (m->valid != VM_PAGE_BITS_ALL) {
 			/*
 			 * Ask the pager for this single page, then look the
 			 * page up again rather than trusting "m" across the
 			 * pager call.
 			 */
 			ma[0] = m;
 			rv = vm_pager_get_pages(object, ma, 1, 0);
 			m = vm_page_lookup(object, pindex);
 			/*
 			 * Either break leaves pindex < end, so the function
 			 * reports failure.
 			 */
 			if (m == NULL)
 				break;
 			if (rv != VM_PAGER_OK) {
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				break;
 			}
 		}
 		/*
 		 * Keep "m" busy because a subsequent iteration may unlock
 		 * the object.
 		 */
 	}
 	/*
 	 * Wake up the pages in [start, pindex), i.e. those that were
 	 * populated successfully and left busy by the loop above.
 	 */
 	if (pindex > start) {
 		m = vm_page_lookup(object, start);
 		while (m != NULL && m->pindex < pindex) {
 			vm_page_wakeup(m);
 			m = TAILQ_NEXT(m, listq);
 		}
 	}
 	/* TRUE only if the loop ran to completion. */
 	return (pindex == end);
 }
 
 /*
  *	Routine:	vm_object_coalesce
  *	Function:	Coalesces two objects backing up adjoining
  *			regions of memory into a single object.
  *
  *	returns TRUE if objects were combined.
  *
  *	NOTE:	Only works at the moment if the second object is NULL -
  *		if it's not, which object do we lock first?
  *
  *	Parameters:
  *		prev_object	First object to coalesce
  *		prev_offset	Offset into prev_object
  *		prev_size	Size of reference to prev_object
  *		next_size	Size of reference to the second object
  *		reserved	Indicator that extension region has
  *				swap accounted for
  *
  *	Returns TRUE when prev_object is NULL or was extended to cover the
  *	adjoining region, and FALSE when it cannot be coalesced (wrong
  *	object type, still has a backing object after the collapse attempt,
  *	shared and not ending at the adjoining index, or the swap
  *	reservation for the extension failed).
  *
  *	Conditions:
  *	The object must *not* be locked.  It is locked here and is unlocked
  *	again on every return path.
  */
 boolean_t
 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
     vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
 {
 	vm_pindex_t next_pindex;
 
 	if (prev_object == NULL)
 		return (TRUE);
 	VM_OBJECT_LOCK(prev_object);
 	if (prev_object->type != OBJT_DEFAULT &&
 	    prev_object->type != OBJT_SWAP) {
 		VM_OBJECT_UNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/*
 	 * Try to collapse the object first
 	 */
 	vm_object_collapse(prev_object);
 
 	/*
 	 * Can't coalesce if: . more than one reference . paged out . shadows
 	 * another object . has a copy elsewhere (any of which mean that the
 	 * pages not mapped to prev_entry may be in use anyway)
 	 */
 	if (prev_object->backing_object != NULL) {
 		VM_OBJECT_UNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/* Convert the byte sizes to page counts. */
 	prev_size >>= PAGE_SHIFT;
 	next_size >>= PAGE_SHIFT;
 	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
 
 	if ((prev_object->ref_count > 1) &&
 	    (prev_object->size != next_pindex)) {
 		VM_OBJECT_UNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/*
 	 * Account for the charge.
 	 */
 	if (prev_object->cred != NULL) {
 
 		/*
 		 * If prev_object was charged, then this mapping,
 		 * although not charged now, may become writable
 		 * later. Non-NULL cred in the object would prevent
 		 * swap reservation during enabling of the write
 		 * access, so reserve swap now. Failed reservation
 		 * cause allocation of the separate object for the map
 		 * entry, and swap reservation for this entry is
 		 * managed in appropriate time.
 		 */
 		if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
 		    prev_object->cred)) {
 			/*
 			 * The object lock must not be leaked on this
 			 * failure path: callers expect the object to be
 			 * unlocked on return.
 			 */
 			VM_OBJECT_UNLOCK(prev_object);
 			return (FALSE);
 		}
 		prev_object->charge += ptoa(next_size);
 	}
 
 	/*
 	 * Remove any pages that may still be in the object from a previous
 	 * deallocation.
 	 */
 	if (next_pindex < prev_object->size) {
 		vm_object_page_remove(prev_object, next_pindex, next_pindex +
 		    next_size, 0);
 		if (prev_object->type == OBJT_SWAP)
 			swap_pager_freespace(prev_object,
 					     next_pindex, next_size);
 #if 0
 		if (prev_object->cred != NULL) {
 			KASSERT(prev_object->charge >=
 			    ptoa(prev_object->size - next_pindex),
 			    ("object %p overcharged 1 %jx %jx", prev_object,
 				(uintmax_t)next_pindex, (uintmax_t)next_size));
 			prev_object->charge -= ptoa(prev_object->size -
 			    next_pindex);
 		}
 #endif
 	}
 
 	/*
 	 * Extend the object if necessary.
 	 */
 	if (next_pindex + next_size > prev_object->size)
 		prev_object->size = next_pindex + next_size;
 
 	VM_OBJECT_UNLOCK(prev_object);
 	return (TRUE);
 }
 
 /*
  * Record that a vnode-backed object may contain dirty pages: bump its
  * generation and make sure OBJ_MIGHTBEDIRTY is set.  Objects of any other
  * type are left untouched.  The object must be locked.
  */
 void
 vm_object_set_writeable_dirty(vm_object_t object)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type == OBJT_VNODE) {
 		object->generation++;
 		/* Set the flag only when it is not already set. */
 		if ((object->flags & OBJ_MIGHTBEDIRTY) == 0)
 			vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 	}
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <sys/cons.h>
 
 #include <ddb/ddb.h>
 
 /*
  * Debugger helper: report (1 or 0) whether "object" is reachable from
  * "map".  With a NULL "entry" every entry of the map is examined; with a
  * submap entry every entry of that submap is examined; otherwise the
  * entry's object and its chain of backing objects are compared against
  * "object".
  */
 static int
 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
 {
 	vm_map_t scan_map;
 	vm_map_entry_t cur;
 	vm_object_t link;
 	int remaining;
 
 	if (map == NULL)
 		return (0);
 
 	/*
 	 * Leaf entry: follow the shadow chain starting at the entry's
 	 * object (which may be NULL).
 	 */
 	if (entry != NULL && (entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		for (link = entry->object.vm_object; link != NULL;
 		    link = link->backing_object) {
 			if (link == object)
 				return (1);
 		}
 		return (0);
 	}
 
 	/*
 	 * Whole map or submap: examine each entry in turn, stopping after
 	 * at most "nentries" entries or on reaching the list header again.
 	 */
 	scan_map = (entry == NULL) ? map : entry->object.sub_map;
 	cur = scan_map->header.next;
 	for (remaining = scan_map->nentries;
 	    remaining != 0 && cur != &scan_map->header;
 	    remaining--, cur = cur->next) {
 		if (_vm_object_in_map(scan_map, object, cur))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Debugger helper: report (1 or 0) whether "object" is found in the
  * address space of any process or in one of the kernel_map, kmem_map,
  * pager_map or buffer_map maps.
  */
 static int
 vm_object_in_map(vm_object_t object)
 {
 	struct proc *p;
 
 	/*
 	 * The process list is walked without taking allproc_lock
 	 * (the sx_slock()/sx_sunlock() calls are deliberately left out).
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/* The P_SYSTEM|P_WEXIT filter is likewise left disabled. */
 		if (p->p_vmspace &&
 		    _vm_object_in_map(&p->p_vmspace->vm_map, object, 0))
 			return (1);
 	}
 
 	/* Not in any process; try the kernel's own maps in order. */
 	return (_vm_object_in_map(kernel_map, object, 0) ||
 	    _vm_object_in_map(kmem_map, object, 0) ||
 	    _vm_object_in_map(pager_map, object, 0) ||
 	    _vm_object_in_map(buffer_map, object, 0));
 }
 
 /*
  * "show vmochk": check every internal object (no handle, default or swap
  * type) on the global object list, reporting those with a zero reference
  * count and those that are not found in any map.
  */
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t obj;
 
 	TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 		/* Only internal (anonymous) objects are checked. */
 		if (obj->handle != NULL)
 			continue;
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP)
 			continue;
 
 		if (obj->ref_count == 0)
 			db_printf("vmochk: internal obj has zero ref count: %ld\n",
 			    (long)obj->size);
 
 		if (vm_object_in_map(obj))
 			continue;
 		db_printf(
 		    "vmochk: internal obj is not in a map: "
 		    "ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
 		    obj->ref_count, (u_long)obj->size,
 		    (u_long)obj->size,
 		    (void *)obj->backing_object);
 	}
 }
 
 /*
  *	vm_object_print:	[ debug ]
  */
 /*
  * "show object": print a summary of the object whose address is given
  * and, when "full" is set, the offset and physical address of each of its
  * resident pages, six per output line.
  *
  * NOTE(review): "addr", "have_addr", "count" and "modif" are presumably the
  * arguments supplied by the DB_SHOW_COMMAND() macro; confirm against
  * <ddb/ddb.h>.
  */
 DB_SHOW_COMMAND(object, vm_object_print_static)
 {
 	/* XXX convert args. */
 	vm_object_t object = (vm_object_t)addr;
 	boolean_t full = have_addr;
 
 	vm_page_t p;
 
 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
 #define	count	was_count
 
 	/*
 	 * Because of the macro above, this declares "was_count"; every
 	 * later use of "count" in this function refers to it.
 	 */
 	int count;
 
 	if (object == NULL)
 		return;
 
 	db_iprintf(
 	    "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n",
 	    object, (int)object->type, (uintmax_t)object->size,
 	    object->resident_page_count, object->ref_count, object->flags,
 	    object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge);
 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
 	    object->shadow_count, 
 	    object->backing_object ? object->backing_object->ref_count : 0,
 	    object->backing_object, (uintmax_t)object->backing_object_offset);
 
 	if (!full)
 		return;
 
 	db_indent += 2;
 	/* "count" tracks how many pages are on the current output line. */
 	count = 0;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		if (count == 0)
 			db_iprintf("memory:=");
 		else if (count == 6) {
 			/* Line is full: start a continuation line. */
 			db_printf("\n");
 			db_iprintf(" ...");
 			count = 0;
 		} else
 			db_printf(",");
 		count++;
 
 		db_printf("(off=0x%jx,page=0x%jx)",
 		    (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
 	}
 	if (count != 0)
 		db_printf("\n");
 	db_indent -= 2;
 }
 
 /* End of the "count" -> "was_count" renaming used by the command above. */
 #undef count
 
 /* XXX need this non-static entry for calling from vm_map_print. */
 /*
  * Externally visible wrapper around the "show object" command body; all
  * four arguments are passed through unchanged.  "addr" and "count" are
  * declared long here in place of db_expr_t.
  */
 void
 vm_object_print(long addr, boolean_t have_addr, long count, char *modif)
 {
 
 	vm_object_print_static(addr, have_addr, count, modif);
 }
 
 /*
  * Output pacing for "show vmopag": once more than 18 lines have been
  * counted, read a character from the console; anything other than a space
  * ends the listing (return 0), a space resets the line count.  The line
  * just printed is then counted and 1 is returned.
  */
 static int
 vmopag_pause(int *nlp)
 {
 
 	if (*nlp > 18) {
 		if (cngetc() != ' ')
 			return (0);
 		*nlp = 0;
 	}
 	(*nlp)++;
 	return (1);
 }
 
 /*
  * Print one run of physically contiguous pages for "show vmopag", then
  * apply the output pacing.  Returns 0 if the listing should stop.
  */
 static int
 vmopag_print_run(vm_pindex_t first, int npages, vm_paddr_t pa, int *nlp)
 {
 
 	db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 	    (long)first, npages, (long)pa);
 	return (vmopag_pause(nlp));
 }
 
 /*
  * "show vmopag": for every object on the global object list, print the
  * runs of physically contiguous resident pages found among the pages with
  * index 128 or below.
  */
 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
 {
 	vm_object_t object;
 	vm_page_t m, prev_m;
 	vm_paddr_t run_pa;
 	vm_pindex_t run_idx;
 	int lines, run_len;
 
 	lines = 0;
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		db_printf("new object: %p\n", (void *)object);
 		if (!vmopag_pause(&lines))
 			return;
 
 		/* No run is open at the start of each object. */
 		run_len = 0;
 		run_idx = 0;
 		run_pa = -1;
 		TAILQ_FOREACH(m, &object->memq, listq) {
 			if (m->pindex > 128)
 				break;
 
 			/*
 			 * A gap in the page indices ends the open run, if
 			 * there is one.
 			 */
 			prev_m = TAILQ_PREV(m, pglist, listq);
 			if (prev_m != NULL &&
 			    prev_m->pindex + 1 != m->pindex &&
 			    run_len != 0) {
 				if (!vmopag_print_run(run_idx, run_len,
 				    run_pa, &lines))
 					return;
 				run_len = 0;
 			}
 
 			if (run_len != 0) {
 				/*
 				 * Extend the open run when this page
 				 * follows it physically; otherwise flush
 				 * it.
 				 */
 				if (VM_PAGE_TO_PHYS(m) ==
 				    run_pa + run_len * PAGE_SIZE) {
 					++run_len;
 					continue;
 				}
 				if (!vmopag_print_run(run_idx, run_len,
 				    run_pa, &lines))
 					return;
 			}
 
 			/* Open a new run at this page. */
 			run_idx = m->pindex;
 			run_pa = VM_PAGE_TO_PHYS(m);
 			run_len = 1;
 		}
 
 		/* Flush the run left open at the end of the object. */
 		if (run_len != 0) {
 			if (!vmopag_print_run(run_idx, run_len, run_pa,
 			    &lines))
 				return;
 		}
 	}
 }
 #endif /* DDB */
Index: user/attilio/vmc-playground/sys/vm/vm_object.h
===================================================================
--- user/attilio/vmc-playground/sys/vm/vm_object.h	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/vm_object.h	(revision 247224)
@@ -1,271 +1,267 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.h	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 /*
  *	Virtual memory object module definitions.
  */
 
 #ifndef	_VM_OBJECT_
 #define	_VM_OBJECT_
 
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
 #include <vm/_vm_radix.h>
 
 /*
  *	Types defined:
  *
  *	vm_object_t		Virtual memory object.
  *
  *	The root of cached pages pool is protected by both the per-object mutex
  *	and the free pages queue mutex.
  *	On insert in the cache radix trie, the per-object mutex is expected
  *	to be already held and the free pages queue mutex will be
  *	acquired during the operation too.
  *	On remove and lookup from the cache radix trie, only the free
  *	pages queue mutex is expected to be locked.
  *	These rules allow for reliably checking for the presence of cached
  *	pages with only the per-object lock held, thereby reducing contention
  *	for the free pages queue mutex.
  *
  * List of locks
  *	(c)	const until freed
  *	(o)	per-object mutex
  *	(f)	free pages queue mutex
  *
  */
 
 /*
  * Virtual memory object.  Fields tagged (c), (o) or (f) follow the
  * "List of locks" key in the comment preceding this structure.
  */
 struct vm_object {
 	struct mtx mtx;			/* per-object mutex, see VM_OBJECT_LOCK() */
 	TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
 	LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
 	LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
 	TAILQ_HEAD(, vm_page) memq;	/* list of resident pages */
 	struct vm_radix rtree;		/* root of the resident page radix trie */
 	vm_pindex_t size;		/* Object size */
 	int generation;			/* generation ID */
 	int ref_count;			/* How many refs?? */
 	int shadow_count;		/* how many objects that this is a shadow for */
 	vm_memattr_t memattr;		/* default memory attribute for pages */
 	objtype_t type;			/* type of pager */
 	u_short flags;			/* see below */
 	u_short pg_color;		/* (c) color of first page in obj */
 	u_int paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
 	int resident_page_count;	/* number of resident pages */
 	struct vm_object *backing_object; /* object that I'm a shadow of */
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
 	LIST_HEAD(, vm_reserv) rvq;	/* list of reservations */
 	struct vm_radix cache;		/* (o + f) root of the cache page radix trie */
 	void *handle;			/* NOTE(review): presumably a pager-private handle - confirm */
 	/* Per-pager private data; which member is valid presumably depends on 'type'. */
 	union {
 		/*
 		 * VNode pager
 		 *
 		 *	vnp_size - current size of file
 		 */
 		struct {
 			off_t vnp_size;
 			vm_ooffset_t writemappings;
 		} vnp;
 
 		/*
 		 * Device pager
 		 *
 		 *	devp_pglist - list of allocated pages
 		 */
 		struct {
 			TAILQ_HEAD(, vm_page) devp_pglist;
 			struct cdev_pager_ops *ops;
 			struct cdev *dev;
 		} devp;
 
 		/*
 		 * SG pager
 		 *
 		 *	sgp_pglist - list of allocated pages
 		 */
 		struct {
 			TAILQ_HEAD(, vm_page) sgp_pglist;
 		} sgp;
 
 		/*
 		 * Swap pager
 		 *
 		 *	swp_bcount - number of swap 'swblock' metablocks, each
 		 *		     contains up to 16 swapblk assignments.
 		 *		     see vm/swap_pager.h
 		 */
 		struct {
 			int swp_bcount;
 		} swp;
 	} un_pager;
 	/* NOTE(review): cred/charge look like resource accounting state - confirm. */
 	struct ucred *cred;
 	vm_ooffset_t charge;
 };
 
 /*
  * Flags
  */
 #define	OBJ_FICTITIOUS	0x0001		/* (c) contains fictitious pages */
 #define	OBJ_UNMANAGED	0x0002		/* (c) contains unmanaged pages */
 #define OBJ_ACTIVE	0x0004		/* active objects */
 #define OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
 #define OBJ_PIPWNT	0x0040		/* paging in progress wanted */
 #define OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
 #define	OBJ_DISCONNECTWNT 0x4000	/* disconnect from vnode wanted */
 
 #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
 
 #ifdef	_KERNEL
 
 #define OBJPC_SYNC	0x1			/* sync I/O */
 #define OBJPC_INVAL	0x2			/* invalidate */
 #define OBJPC_NOSYNC	0x4			/* skip if PG_NOSYNC */
 
 /*
  * The following options are supported by vm_object_page_remove().
  */
 #define	OBJPR_CLEANONLY	0x1		/* Don't remove dirty pages. */
 #define	OBJPR_NOTMAPPED	0x2		/* Don't unmap pages. */
 
 TAILQ_HEAD(object_q, vm_object);
 
 extern struct object_q vm_object_list;	/* list of allocated objects */
 extern struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 extern struct vm_object kernel_object_store;
 extern struct vm_object kmem_object_store;
 
 #define	kernel_object	(&kernel_object_store)
 #define	kmem_object	(&kmem_object_store)
 
 #define	VM_OBJECT_LOCK(object)		mtx_lock(&(object)->mtx)
 #define	VM_OBJECT_LOCK_ASSERT(object, type) \
 					mtx_assert(&(object)->mtx, (type))
-#define	VM_OBJECT_LOCK_INIT(object, type) \
-					mtx_init(&(object)->mtx, "vm object", \
-					    (type), MTX_DEF | MTX_DUPOK)
 #define	VM_OBJECT_LOCKED(object)	mtx_owned(&(object)->mtx)
 #define	VM_OBJECT_MTX(object)		(&(object)->mtx)
 #define	VM_OBJECT_TRYLOCK(object)	mtx_trylock(&(object)->mtx)
 #define	VM_OBJECT_UNLOCK(object)	mtx_unlock(&(object)->mtx)
 
 /*
  *	Turn on the given flag bits in the object's flags word.
  *	The object must be locked or thread private.
  */
 static __inline void
 vm_object_set_flag(vm_object_t object, u_short bits)
 {
 
 	object->flags = object->flags | bits;
 }
 
 void vm_object_clear_flag(vm_object_t object, u_short bits);
 void vm_object_pip_add(vm_object_t object, short i);
 void vm_object_pip_subtract(vm_object_t object, short i);
 void vm_object_pip_wakeup(vm_object_t object);
 void vm_object_pip_wakeupn(vm_object_t object, short i);
 void vm_object_pip_wait(vm_object_t object, char *waitid);
 
 /*
  *	Report whether the object's cache radix trie has no root, i.e.
  *	holds no cached pages.  The empty case is hinted as the likely one.
  */
 static __inline boolean_t
 vm_object_cache_is_empty(vm_object_t object)
 {
 
 	if (__predict_true(object->cache.rt_root == 0))
 		return (TRUE);
 	return (FALSE);
 }
 
 vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
-void _vm_object_allocate (objtype_t, vm_pindex_t, vm_object_t);
 boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
    boolean_t);
 void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_destroy (vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
 void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
 void vm_object_print(long addr, boolean_t have_addr, long count, char *modif);
 void vm_object_reference (vm_object_t);
 void vm_object_reference_locked(vm_object_t);
 int  vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr);
 void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t);
 void vm_object_split(vm_map_entry_t);
 boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t,
     boolean_t);
 #endif				/* _KERNEL */
 
 #endif				/* _VM_OBJECT_ */
Index: user/attilio/vmc-playground/sys/vm/vm_radix.c
===================================================================
--- user/attilio/vmc-playground/sys/vm/vm_radix.c	(revision 247223)
+++ user/attilio/vmc-playground/sys/vm/vm_radix.c	(revision 247224)
@@ -1,778 +1,776 @@
 /*
  * Copyright (c) 2013 EMC Corp.
  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * Path-compressed radix trie implementation.
  * The following code is not generalized into a general purpose library
  * because there are way too many parameters embedded that should really
  * be decided by the library consumers.  At the same time, consumers
  * of this code must achieve the highest possible performance.
  *
  * The implementation takes into account the following rationale:
  * - The size of the nodes should be as small as possible.
  * - There is no bias toward lookup operations over inserts or removes,
  *   and vice-versa.
  * - On average there are not many complete levels, so level
  *   compression may just complicate things.
  */
 
 #include <sys/cdefs.h>
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/vmmeter.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_radix.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #ifndef VM_RADIX_BOOT_CACHE
-#define	VM_RADIX_BOOT_CACHE	1500
+#define	VM_RADIX_BOOT_CACHE	150
 #endif
 
 /*
  * Such sizes should permit to keep node children contained into a single
  * cache-line, or to at least not span many of those.
  * In particular, sparse tries should however be compressed properly and
  * then make some extra-levels not a big deal.
  */
 #ifdef __LP64__
 #define	VM_RADIX_WIDTH	4
 #else
 #define	VM_RADIX_WIDTH	3
 #endif
 
 #define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
 #define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
 #define	VM_RADIX_LIMIT							\
 	(howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH) - 1)
 
 /* Flag bits stored in node pointers. */
 #define	VM_RADIX_ISLEAF	0x1
 #define	VM_RADIX_FLAGS	0x1
 #define	VM_RADIX_PAD	VM_RADIX_FLAGS
 
 /* Returns one unit associated with specified level. */
 #define	VM_RADIX_UNITLEVEL(lev)						\
 	((vm_pindex_t)1 << ((VM_RADIX_LIMIT - (lev)) * VM_RADIX_WIDTH))
 
 /*
  * Internal trie node.  Each rn_child slot is either NULL, a pointer to a
  * deeper node, or a page pointer tagged with VM_RADIX_ISLEAF (see
  * vm_radix_addpage() and vm_radix_node_page()).
  */
 struct vm_radix_node {
 	void		*rn_child[VM_RADIX_COUNT];	/* Child nodes. */
 	vm_pindex_t	 rn_owner;			/* Owner of record. */
 	uint16_t	 rn_count;			/* Valid children. */
 	uint16_t	 rn_clev;			/* Current level. */
 };
 
 static uma_zone_t vm_radix_node_zone;
 
 /*
  * Boot-time cache of struct vm_radix_node objects.
  * This cache is used to cater page allocations before the UMA zone is
  * actually setup and pre-allocated (ie. pmap_init()).
  */
 static u_int boot_cache_cnt;
 static struct vm_radix_node boot_cache[VM_RADIX_BOOT_CACHE];
 
 /*
  * Hand out the next unused entry of the static boot-time node cache.
  * Panics when the number of entries already carved equals
  * VM_RADIX_BOOT_CACHE.
  */
 static struct vm_radix_node *
 vm_radix_carve_bootcache(void)
 {
 
 	if (boot_cache_cnt == VM_RADIX_BOOT_CACHE)
 		panic("%s: Increase VM_RADIX_BOOT_CACHE (%u)", __func__,
 		    VM_RADIX_BOOT_CACHE);
 	return (&boot_cache[boot_cache_cnt++]);
 }
 
 /*
  * Allocate a radix node and set its owner, child count and level.
  * While boot_cache_cnt has not been pushed past VM_RADIX_BOOT_CACHE (which
  * vm_radix_init() does) the node is carved from the static boot cache;
  * afterwards it comes, zeroed, from the pre-allocated UMA zone.
  * Pre-allocation is meant to guarantee success, so a NULL from the zone
  * is treated as fatal.
  */
 static __inline struct vm_radix_node *
 vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
 {
 	struct vm_radix_node *node;
 
 	if (__predict_true(boot_cache_cnt > VM_RADIX_BOOT_CACHE)) {
 		node = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
 
 		/*
 		 * vm_radix_init() pre-allocates the nodes, but UMA may park
 		 * some of them in per-cpu buckets that the current CPU
 		 * cannot reach, so the allocation can still come back NULL
 		 * when the pool is nearly drained.  Since not every insert
 		 * needs a fresh node the pool normally keeps enough slack
 		 * for this not to happen in practice.
 		 */
 		if (node == NULL)
 			panic("%s: uma_zalloc() returned NULL for a new node",
 			    __func__);
 	} else
 		node = vm_radix_carve_bootcache();
 	node->rn_clev = clevel;
 	node->rn_count = count;
 	node->rn_owner = owner;
 	return (node);
 }
 
 /*
  * Free radix node.
  * Nodes that live inside the static boot cache were never allocated from
  * the UMA zone, so they are silently dropped instead of being handed to
  * uma_zfree(); every other node is returned to vm_radix_node_zone.
  */
 static __inline void
 vm_radix_node_put(struct vm_radix_node *rnode)
 {
 
 	/*
 	 * The boot cache is the half-open range
 	 * [&boot_cache[0], &boot_cache[VM_RADIX_BOOT_CACHE]): its first
 	 * element is a real boot node and must be caught here, while the
 	 * one-past-the-end address is not part of the array.
 	 */
 	if (__predict_false(rnode >= boot_cache &&
 	    rnode < &boot_cache[VM_RADIX_BOOT_CACHE]))
 		return;
 	uma_zfree(vm_radix_node_zone, rnode);
 }
 
 /*
  * Extract from the key the child-array position used at the given level:
  * the key is shifted right so that the level's digit lands in the low
  * bits and is then masked with VM_RADIX_MASK.
  */
 static __inline int
 vm_radix_slot(vm_pindex_t index, uint16_t level)
 {
 	vm_pindex_t shifted;
 
 	shifted = index >> ((VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH);
 	return (shifted & VM_RADIX_MASK);
 }
 
 /*
  * Clear every key digit below the specified level.  Levels at or beyond
  * VM_RADIX_LIMIT leave the key untouched.
  */
 static __inline vm_pindex_t
 vm_radix_trimkey(vm_pindex_t index, uint16_t level)
 {
 	int shift;
 
 	if (level >= VM_RADIX_LIMIT)
 		return (index);
 	shift = (VM_RADIX_LIMIT - level) * VM_RADIX_WIDTH;
 	return ((index >> shift) << shift);
 }
 
 /*
  * Get the root node for a radix tree.
  */
 static __inline struct vm_radix_node *
 vm_radix_getroot(struct vm_radix *rtree)
 {
 
 	return ((struct vm_radix_node *)(rtree->rt_root & ~VM_RADIX_FLAGS));
 }
 
 /*
  * Set the root node for a radix tree.
  */
 static __inline void
 vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
 {
 
 	rtree->rt_root = (uintptr_t)rnode;
 }
 
 /*
  * Returns the associated page extracted from rnode if available,
  * NULL otherwise.
  */
 static __inline vm_page_t
 vm_radix_node_page(struct vm_radix_node *rnode)
 {
 
 	return ((((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0) ?
 	    (vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS) : NULL);
 }
 
 /*
  * Store the page, tagged as a leaf, in the child slot of rnode that the
  * key selects at level clev.  The child count is left to the caller.
  */
 static __inline void
 vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
     vm_page_t page)
 {
 
 	rnode->rn_child[vm_radix_slot(index, clev)] =
 	    (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
 }
 
 /*
  * Return the first level, scanning from level 0, at which the two keys
  * select different slots.
  * The keys must differ; equal keys trip the assertion and then the panic.
  */
 static __inline uint16_t
 vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
 {
 	vm_pindex_t diff;
 	uint16_t lev;
 
 	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
 	    __func__, (uintmax_t)index1));
 
 	/* A non-zero digit of the XOR marks a level where the keys differ. */
 	diff = index1 ^ index2;
 	lev = 0;
 	while (lev <= VM_RADIX_LIMIT) {
 		if (vm_radix_slot(diff, lev) != 0)
 			return (lev);
 		lev++;
 	}
 	panic("%s: it might have not reached this point", __func__);
 	return (0);
 }
 
 /*
  * Tell whether the key certainly lies outside the subtree rooted at rnode:
  * TRUE when the key, trimmed at the level above the node's own, differs
  * from the node's owner.  A level 0 node can never exclude a key, so the
  * answer for it is always FALSE.
  */
 static __inline boolean_t
 vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
 {
 
 	if (rnode->rn_clev == 0)
 		return (FALSE);
 	if (vm_radix_trimkey(idx, rnode->rn_clev - 1) != rnode->rn_owner)
 		return (TRUE);
 	return (FALSE);
 }
 
 /*
  * Adjusts the idx key to the first upper level available, based on a valid
  * initial level and map of available levels.
  * Returns a value bigger than 0 to signal that there are not valid levels
  * available.
  *
  * idx is updated in place; levels[] is indexed by trie level and ilev is
  * the level the back-scan starts from.  Returns 0 when idx was advanced.
  */
 static __inline int
 vm_radix_addlev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
 {
 	vm_pindex_t wrapidx;
 
 	/*
 	 * Walk towards level 0 while the level is not marked in the map or
 	 * the key already sits in the last slot of that level (so it cannot
 	 * be incremented there).  The scan never goes below level 0.
 	 */
 	for (; levels[ilev] == FALSE ||
 	    vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1); ilev--)
 		if (ilev == 0)
 			break;
 	KASSERT(ilev > 0 || levels[0] == TRUE,
 	    ("%s: levels back-scanning problem", __func__));
 	/* Even level 0 is in its last slot: nothing bigger exists. */
 	if (ilev == 0 && vm_radix_slot(*idx, ilev) == (VM_RADIX_COUNT - 1))
 		return (1);
 	/* Drop the digits below ilev and step to the next slot of ilev. */
 	wrapidx = *idx;
 	*idx = vm_radix_trimkey(*idx, ilev);
 	*idx += VM_RADIX_UNITLEVEL(ilev);
 	/* A result smaller than the starting key means the key wrapped. */
 	if (*idx < wrapidx)
 		return (1);
 	return (0);
 }
 
 /*
  * Adjusts the idx key to the first lower level available, based on a valid
  * initial level and map of available levels.
  * Returns a value bigger than 0 to signal that there are not valid levels
  * available.
  *
  * idx is updated in place; levels[] is indexed by trie level and ilev is
  * the level the back-scan starts from.  Returns 0 when idx was lowered.
  */
 static __inline int
 vm_radix_declev(vm_pindex_t *idx, boolean_t *levels, uint16_t ilev)
 {
 	vm_pindex_t wrapidx;
 
 	/*
 	 * Walk towards level 0 while the level is not marked in the map or
 	 * the key already sits in slot 0 of that level (so it cannot be
 	 * decremented there).  The scan never goes below level 0.
 	 */
 	for (; levels[ilev] == FALSE ||
 	    vm_radix_slot(*idx, ilev) == 0; ilev--)
 		if (ilev == 0)
 			break;
 	KASSERT(ilev > 0 || levels[0] == TRUE,
 	    ("%s: levels back-scanning problem", __func__));
 	/* Even level 0 is in its first slot: nothing smaller exists. */
 	if (ilev == 0 && vm_radix_slot(*idx, ilev) == 0)
 		return (1);
 	/*
 	 * Set every digit below ilev to its maximum and step back one slot
 	 * at ilev, yielding the biggest key of the previous slot.
 	 */
 	wrapidx = *idx;
 	*idx = vm_radix_trimkey(*idx, ilev);
 	*idx |= VM_RADIX_UNITLEVEL(ilev) - 1;
 	*idx -= VM_RADIX_UNITLEVEL(ilev);
 	/* A result bigger than the starting key means the key wrapped. */
 	if (*idx > wrapidx)
 		return (1);
 	return (0);
 }
 
 /*
  * Internal handwork for the vm_radix_reclaim_allnodes() primitive.
  * Visits the children of rnode until its child count drops to zero,
  * descending into every child that is a node rather than a page, and
  * finally releases rnode itself.
  * This function is recursive.
  */
 static void
 vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
 {
 	void *child;
 	int i;
 
 	for (i = 0; i < VM_RADIX_COUNT && rnode->rn_count != 0; i++) {
 		child = rnode->rn_child[i];
 		if (child == NULL)
 			continue;
 		/* Pages are leaves: only inner nodes need to be reclaimed. */
 		if (vm_radix_node_page(child) == NULL)
 			vm_radix_reclaim_allnodes_int(child);
 		rnode->rn_count--;
 	}
 	vm_radix_node_put(rnode);
 }
 
 #ifdef INVARIANTS
 /*
  * Radix node zone destructor.
  * Asserts that a node being released has no children accounted for.
  */
 static void
 vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
 {
 	struct vm_radix_node *node = mem;
 
 	KASSERT(node->rn_count == 0,
 	    ("vm_radix_node_put: Freeing node %p with %d children\n", mem,
 	    node->rn_count));
 }
 #endif
 
 /*
  * Pre-allocate intermediate nodes from the UMA slab zone.
  *
  * Creates the "RADIX NODE" zone (with the node destructor attached only
  * under INVARIANTS), sizes it after cnt.v_page_count, pre-allocates the
  * nodes and finally retires the static boot cache.  Registered through
  * the SYSINIT() that follows this function.
  */
 static void
 vm_radix_init(void *arg __unused)
 {
-	int nitems;
 
 	vm_radix_node_zone = uma_zcreate("RADIX NODE",
 	    sizeof(struct vm_radix_node), NULL,
 #ifdef INVARIANTS
 	    vm_radix_node_zone_dtor,
 #else
 	    NULL,
 #endif
 	    NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_NOFREE);
-	nitems = uma_zone_set_max(vm_radix_node_zone, cnt.v_page_count);
-	if (nitems < cnt.v_page_count)
-		panic("%s: unexpected requested number of items", __func__);
-	uma_prealloc(vm_radix_node_zone, nitems);
+	if (!uma_zone_reserve_kva(vm_radix_node_zone, cnt.v_page_count))
+		panic("%s: unable to create new zone", __func__);
+	uma_prealloc(vm_radix_node_zone, cnt.v_page_count);
 	/*
 	 * Push the counter past VM_RADIX_BOOT_CACHE so that
 	 * vm_radix_node_get() stops carving from the boot cache and
 	 * allocates from the zone from now on.
 	 */
 	boot_cache_cnt = VM_RADIX_BOOT_CACHE + 1;
 }
 SYSINIT(vm_radix_init, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_init, NULL);
 
 /*
  * Inserts the key-value pair in to the trie.
  * Panics if the key already exists.
  *
  * The page is stored as a tagged leaf pointer.  Three outcomes are
  * possible: the page fills an empty slot of an existing node, it is
  * paired with a colliding page under a freshly allocated node, or a new
  * intermediate node is spliced above an existing subtree whose prefix
  * does not match the key.
  */
 void
 vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, vm_page_t page)
 {
 	vm_pindex_t newind;
 	struct vm_radix_node *rnode, *tmp, *tmp2;
 	vm_page_t m;
 	int slot;
 	uint16_t clev;
 
 	/*
 	 * The owner of record for root is not really important because it
 	 * will never be used.
 	 */
 	rnode = vm_radix_getroot(rtree);
 	if (rnode == NULL) {
 		/* Empty trie: create a level 0 root holding just this page. */
 		rnode = vm_radix_node_get(0, 1, 0);
 		vm_radix_setroot(rtree, rnode);
 		vm_radix_addpage(rnode, index, 0, page);
 		return;
 	}
 	while (rnode != NULL) {
 		/* The key does not belong below rnode: splice needed. */
 		if (vm_radix_keybarr(rnode, index) == TRUE)
 			break;
 		slot = vm_radix_slot(index, rnode->rn_clev);
 		m = vm_radix_node_page(rnode->rn_child[slot]);
 		if (m != NULL) {
 			if (m->pindex == index)
 				panic("%s: key %jx is already present",
 				    __func__, (uintmax_t)index);
 			/*
 			 * The slot holds a different page: replace it with a
 			 * new two-children node placed at the first level
 			 * where the two keys differ.
 			 */
 			clev = vm_radix_keydiff(m->pindex, index);
 			tmp = vm_radix_node_get(vm_radix_trimkey(index,
 			    clev - 1), 2, clev);
 			rnode->rn_child[slot] = tmp;
 			vm_radix_addpage(tmp, index, clev, page);
 			vm_radix_addpage(tmp, m->pindex, clev, m);
 			return;
 		}
 		if (rnode->rn_child[slot] == NULL) {
 			/* Free slot: the page simply becomes a new child. */
 			rnode->rn_count++;
 			vm_radix_addpage(rnode, index, rnode->rn_clev, page);
 			return;
 		}
 		rnode = rnode->rn_child[slot];
 	}
 	if (rnode == NULL)
 		panic("%s: path traversal ended unexpectedly", __func__);
 
 	/*
 	 * Scan the trie from the top and find the parent to insert
 	 * the new object.
 	 * newind is the owner of the node that rejected the key and clev
 	 * the first level where that owner and the key differ; the scan
 	 * stops at the node whose child on the key's path sits deeper than
 	 * clev.
 	 */
 	newind = rnode->rn_owner;
 	clev = vm_radix_keydiff(newind, index);
 	slot = VM_RADIX_COUNT;
 	for (rnode = vm_radix_getroot(rtree); ; rnode = tmp) {
 		KASSERT(rnode != NULL, ("%s: edge cannot be NULL in the scan",
 		    __func__));
 		KASSERT(clev >= rnode->rn_clev,
 		    ("%s: unexpected trie depth: clev: %d, rnode->rn_clev: %d",
 		    __func__, clev, rnode->rn_clev));
 		slot = vm_radix_slot(index, rnode->rn_clev);
 		tmp = rnode->rn_child[slot];
 		KASSERT(tmp != NULL && vm_radix_node_page(tmp) == NULL,
 		    ("%s: unexpected lookup interruption", __func__));
 		if (tmp->rn_clev > clev)
 			break;
 	}
 	KASSERT(rnode != NULL && tmp != NULL && slot < VM_RADIX_COUNT,
 	    ("%s: invalid scan parameters rnode: %p, tmp: %p, slot: %d",
 	    __func__, (void *)rnode, (void *)tmp, slot));
 
 	/*
 	 * A new node is needed because the right insertion level is reached.
 	 * Setup the new intermediate node and add the 2 children: the
 	 * new object and the older edge.
 	 */
 	tmp2 = vm_radix_node_get(vm_radix_trimkey(page->pindex, clev - 1), 2,
 	    clev);
 	rnode->rn_child[slot] = tmp2;
 	vm_radix_addpage(tmp2, index, clev, page);
 	/* Re-hang the old subtree under the slot selected by its owner. */
 	slot = vm_radix_slot(newind, clev);
 	tmp2->rn_child[slot] = tmp;
 }
 
 /*
  * Exact-match lookup: return the page stored under index, or NULL when
  * the trie holds no page with that key.
  */
 vm_page_t
 vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
 {
 	struct vm_radix_node *node;
 	vm_page_t page;
 
 	for (node = vm_radix_getroot(rtree); node != NULL; ) {
 		/* The key cannot live below a node with another prefix. */
 		if (vm_radix_keybarr(node, index) == TRUE)
 			break;
 		node = node->rn_child[vm_radix_slot(index, node->rn_clev)];
 		page = vm_radix_node_page(node);
 		if (page == NULL)
 			continue;
 		/* Reached a leaf: it either is the key or the key is absent. */
 		return (page->pindex == index ? page : NULL);
 	}
 	return (NULL);
 }
 
 /*
  * Look up any entry at a position bigger than or equal to index.
  *
  * Returns the page found, or NULL when no such entry exists.  Whenever
  * the search key has to be moved forward the descent is restarted from
  * the root with the adjusted key; maplevels[] records the levels of the
  * nodes met during the current descent and is cleared on every restart.
  */
 vm_page_t
 vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
 {
 	vm_pindex_t inc;
 	vm_page_t m;
 	struct vm_radix_node *rnode;
 	int slot;
 	uint16_t difflev;
 	boolean_t maplevels[VM_RADIX_LIMIT + 1];
 #ifdef INVARIANTS
 	int loops = 0;
 #endif
 
 restart:
 	KASSERT(++loops < 1000, ("%s: too many loops", __func__));
 	for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
 		maplevels[difflev] = FALSE;
 	rnode = vm_radix_getroot(rtree);
 	while (rnode != NULL) {
 		maplevels[rnode->rn_clev] = TRUE;
 
 		/*
 		 * If the keys differ before the current bisection node
 		 * the search key might rollback to the earliest
 		 * available bisection node, or to the smaller value
 		 * in the current domain (if the owner is bigger than the
 		 * search key).
 		 * The search for a valid bisection node is helped through
 		 * the use of maplevels array which should bring immediately
 		 * a lower useful level, skipping holes.
 		 */
 		if (vm_radix_keybarr(rnode, index) == TRUE) {
 			difflev = vm_radix_keydiff(index, rnode->rn_owner);
 			if (index > rnode->rn_owner) {
 				if (vm_radix_addlev(&index, maplevels,
 				    difflev) > 0)
 					break;
 			} else
 				index = vm_radix_trimkey(rnode->rn_owner,
 				    difflev);
 			goto restart;
 		}
 		slot = vm_radix_slot(index, rnode->rn_clev);
 		m = vm_radix_node_page(rnode->rn_child[slot]);
 		if (m != NULL && m->pindex >= index)
 			return (m);
 		/* The slot holds an inner node: keep descending. */
 		if (rnode->rn_child[slot] != NULL && m == NULL) {
 			rnode = rnode->rn_child[slot];
 			continue;
 		}
 
 		/*
 		 * Look for an available edge or page within the current
 		 * bisection node.
 		 */
                 if (slot < (VM_RADIX_COUNT - 1)) {
 			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
 			index = vm_radix_trimkey(index, rnode->rn_clev);
 			index += inc;
 			slot++;
 			for (;; index += inc, slot++) {
 				m = vm_radix_node_page(rnode->rn_child[slot]);
 				if (m != NULL && m->pindex >= index)
 					return (m);
 				if ((rnode->rn_child[slot] != NULL &&
 				    m == NULL) || slot == (VM_RADIX_COUNT - 1))
 					break;
 			}
 		}
 
 		/*
 		 * If the scan reached the last slot without finding an edge
 		 * or a suitable page there, skip to the next higher-level
 		 * key and restart, or give up when no such key exists.
 		 */
 		if (slot == (VM_RADIX_COUNT - 1) &&
 		    (rnode->rn_child[slot] == NULL || m != NULL)) {
 			if (rnode->rn_clev == 0  || vm_radix_addlev(&index,
 			    maplevels, rnode->rn_clev - 1) > 0)
 				break;
 			goto restart;
 		}
 		/* The scan stopped on an inner node: descend into it. */
 		rnode = rnode->rn_child[slot];
 	}
 	return (NULL);
 }
 
 /*
  * Look up any entry at a position less than or equal to index.
  *
  * Returns the page found, or NULL when no such entry exists.  Whenever
  * the search key has to be moved backward the descent is restarted from
  * the root with the adjusted key; maplevels[] records the levels of the
  * nodes met during the current descent and is cleared on every restart.
  */
 vm_page_t
 vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
 {
 	vm_pindex_t inc;
 	vm_page_t m;
 	struct vm_radix_node *rnode;
 	int slot;
 	uint16_t difflev;
 	boolean_t maplevels[VM_RADIX_LIMIT + 1];
 #ifdef INVARIANTS
 	int loops = 0;
 #endif
 
 restart:
 	KASSERT(++loops < 1000, ("%s: too many loops", __func__));
 	for (difflev = 0; difflev < (VM_RADIX_LIMIT + 1); difflev++)
 		maplevels[difflev] = FALSE;
 	rnode = vm_radix_getroot(rtree);
 	while (rnode != NULL) {
 		maplevels[rnode->rn_clev] = TRUE;
 
 		/*
 		 * If the keys differ before the current bisection node
 		 * the search key might rollback to the earliest
 		 * available bisection node, or to the higher value
 		 * in the current domain (if the owner is smaller than the
 		 * search key).
 		 * The search for a valid bisection node is helped through
 		 * the use of maplevels array which should bring immediately
 		 * a lower useful level, skipping holes.
 		 */
 		if (vm_radix_keybarr(rnode, index) == TRUE) {
 			difflev = vm_radix_keydiff(index, rnode->rn_owner);
 			if (index > rnode->rn_owner) {
 				index = vm_radix_trimkey(rnode->rn_owner,
 				    difflev);
 				index |= VM_RADIX_UNITLEVEL(difflev) - 1;
 			} else if (vm_radix_declev(&index, maplevels,
 			    difflev) > 0)
 				break;
 			goto restart;
 		}
 		slot = vm_radix_slot(index, rnode->rn_clev);
 		m = vm_radix_node_page(rnode->rn_child[slot]);
 		if (m != NULL && m->pindex <= index)
 			return (m);
 		/* The slot holds an inner node: keep descending. */
 		if (rnode->rn_child[slot] != NULL && m == NULL) {
 			rnode = rnode->rn_child[slot];
 			continue;
 		}
 
 		/*
 		 * Look for an available edge or page within the current
 		 * bisection node.
 		 */
 		if (slot > 0) {
 			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
 			index = vm_radix_trimkey(index, rnode->rn_clev);
 			index |= inc - 1;
 			index -= inc;
 			slot--;
 			for (;; index -= inc, slot--) {
 				m = vm_radix_node_page(rnode->rn_child[slot]);
 				if (m != NULL && m->pindex <= index)
 					return (m);
 				if ((rnode->rn_child[slot] != NULL &&
 				    m == NULL) || slot == 0)
 					break;
 			}
 		}
 
 		/*
 		 * If the scan reached slot 0 without finding an edge or a
 		 * suitable page there, skip to the next lower key at a
 		 * higher level and restart, or give up when none exists.
 		 */
 		if (slot == 0 && (rnode->rn_child[slot] == NULL || m != NULL)) {
 			if (rnode->rn_clev == 0 || vm_radix_declev(&index,
 			    maplevels, rnode->rn_clev - 1) > 0)
 				break;
 			goto restart;
 		}
 		/* The scan stopped on an inner node: descend into it. */
 		rnode = rnode->rn_child[slot];
 	}
 	return (NULL);
 }
 
 /*
  * Remove the specified index from the tree.
  * Panics if the key is not present.
  *
  * After the leaf is cleared, a non-root node left with a single child is
  * collapsed: the remaining child takes the node's place in the parent
  * and the node is freed.  A root node is freed only once it is empty.
  */
 void
 vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
 {
 	struct vm_radix_node *rnode, *parent;
 	vm_page_t m;
 	int i, slot;
 
 	parent = NULL;
 	rnode = vm_radix_getroot(rtree);
 	for (;;) {
 		if (rnode == NULL)
 			panic("vm_radix_remove: impossible to locate the key");
 		slot = vm_radix_slot(index, rnode->rn_clev);
 		m = vm_radix_node_page(rnode->rn_child[slot]);
 		if (m != NULL && m->pindex == index) {
 			rnode->rn_child[slot] = NULL;
 			rnode->rn_count--;
 			/* Two or more children left: the node is still needed. */
 			if (rnode->rn_count > 1)
 				break;
 			if (parent == NULL) {
 				/* rnode is the root: drop it only when empty. */
 				if (rnode->rn_count == 0) {
 					vm_radix_node_put(rnode);
 					vm_radix_setroot(rtree, NULL);
 				}
 				break;
 			}
 			/* Locate the remaining child of this inner node. */
 			for (i = 0; i < VM_RADIX_COUNT; i++)
 				if (rnode->rn_child[i] != NULL)
 					break;
 			KASSERT(i != VM_RADIX_COUNT,
 			    ("%s: invalid node configuration", __func__));
 			slot = vm_radix_slot(index, parent->rn_clev);
 			KASSERT(parent->rn_child[slot] == rnode,
 			    ("%s: invalid child value", __func__));
 			/* Hoist that child into the parent and free rnode. */
 			parent->rn_child[slot] = rnode->rn_child[i];
 			rnode->rn_count--;
 			rnode->rn_child[i] = NULL;
 			vm_radix_node_put(rnode);
 			break;
 		}
 		if (m != NULL && m->pindex != index)
 			panic("%s: invalid key found", __func__);
 		parent = rnode;
 		rnode = rnode->rn_child[slot];
 	}
 }
 
 /*
  * Remove and free all the nodes from the radix tree, leaving it empty.
  * The helper it relies on is recursive, but the recursion is tightly
  * bounded because the maximum depth of the tree is fixed.
  */
 void
 vm_radix_reclaim_allnodes(struct vm_radix *rtree)
 {
 	struct vm_radix_node *top;
 
 	top = vm_radix_getroot(rtree);
 	if (top != NULL) {
 		vm_radix_reclaim_allnodes_int(top);
 		vm_radix_setroot(rtree, NULL);
 	}
 }
 
 #ifdef DDB
 /*
  * Show details about the given radix node.
  */
 DB_SHOW_COMMAND(radixnode, db_show_radixnode)
 {
 	struct vm_radix_node *rnode;
 	int i;
 
         if (!have_addr)
                 return;
 	rnode = (struct vm_radix_node *)addr;
 	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
 	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
 	    rnode->rn_clev);
 	for (i = 0; i < VM_RADIX_COUNT; i++)
 		if (rnode->rn_child[i] != NULL)
 			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
 			    i, (void *)rnode->rn_child[i],
 			    (void *)vm_radix_node_page(rnode->rn_child[i]),
 			    rnode->rn_clev);
 }
 #endif /* DDB */