Index: head/sys/dev/md/md.c
===================================================================
--- head/sys/dev/md/md.c	(revision 362612)
+++ head/sys/dev/md/md.c	(revision 362613)
@@ -1,2174 +1,2172 @@
 /*-
  * SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
  *
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 /*-
  * The following functions are based in the vn(4) driver: mdstart_swap(),
  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
  * and as such under the following copyright:
  *
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Utah Hdr: vn.c 1.13 94/04/02
  *
  *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
  */
 
 #include "opt_rootdevname.h"
 #include "opt_geom.h"
 #include "opt_md.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mdioctl.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #include <sys/disk.h>
 
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 #include <machine/bus.h>
 
 #define MD_MODVER 1
 
 #define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
 #define	MD_EXITING	0x20000		/* Worker thread is exiting. */
 #define MD_PROVIDERGONE	0x40000		/* Safe to free the softc */
 
 #ifndef MD_NSECT
 #define MD_NSECT (10000 * 2)
 #endif
 
 struct md_req {
 	unsigned	md_unit;	/* unit number */
 	enum md_types	md_type;	/* type of disk */
 	off_t		md_mediasize;	/* size of disk in bytes */
 	unsigned	md_sectorsize;	/* sectorsize */
 	unsigned	md_options;	/* options */
 	int		md_fwheads;	/* firmware heads */
 	int		md_fwsectors;	/* firmware sectors */
 	char		*md_file;	/* pathname of file to mount */
 	enum uio_seg	md_file_seg;	/* location of md_file */
 	char		*md_label;	/* label of the device (userspace) */
 	int		*md_units;	/* pointer to units array (kernel) */
 	size_t		md_units_nitems; /* items in md_units array */
 };
 
 #ifdef COMPAT_FREEBSD32
 struct md_ioctl32 {
 	unsigned	md_version;
 	unsigned	md_unit;
 	enum md_types	md_type;
 	uint32_t	md_file;
 	off_t		md_mediasize;
 	unsigned	md_sectorsize;
 	unsigned	md_options;
 	uint64_t	md_base;
 	int		md_fwheads;
 	int		md_fwsectors;
 	uint32_t	md_label;
 	int		md_pad[MDNPAD];
 } __attribute__((__packed__));
 CTASSERT((sizeof(struct md_ioctl32)) == 436);
 
 #define	MDIOCATTACH_32	_IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
 #define	MDIOCDETACH_32	_IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
 #define	MDIOCQUERY_32	_IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
 #define	MDIOCRESIZE_32	_IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
 #endif /* COMPAT_FREEBSD32 */
 
 static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
 static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
 
 static int md_debug;
 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
     "Enable md(4) debug messages");
 static int md_malloc_wait;
 SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
     "Allow malloc to wait for memory allocations");
 
 #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
 #define	MD_ROOT_FSTYPE	"ufs"
 #endif
 
 #if defined(MD_ROOT)
 /*
  * Preloaded image gets put here.
  */
 #if defined(MD_ROOT_SIZE)
 /*
  * We put the mfs_root symbol into the oldmfs section of the kernel object file.
  * Applications that patch the object with the image can determine
  * the size looking at the oldmfs section size within the kernel.
  */
 u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
 const int mfs_root_size = sizeof(mfs_root);
 #elif defined(MD_ROOT_MEM)
 /* MD region already mapped in the memory */
 u_char *mfs_root;
 int mfs_root_size;
 #else
 extern volatile u_char __weak_symbol mfs_root;
 extern volatile u_char __weak_symbol mfs_root_end;
 __GLOBL(mfs_root);
 __GLOBL(mfs_root_end);
 #define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
 #endif
 #endif
 
 static g_init_t g_md_init;
 static g_fini_t g_md_fini;
 static g_start_t g_md_start;
 static g_access_t g_md_access;
 static void g_md_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
 static g_provgone_t g_md_providergone;
 
 static struct cdev *status_dev = NULL;
 static struct sx md_sx;
 static struct unrhdr *md_uh;
 
 static d_ioctl_t mdctlioctl;
 
 static struct cdevsw mdctl_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	mdctlioctl,
 	.d_name =	MD_NAME,
 };
 
 struct g_class g_md_class = {
 	.name = "MD",
 	.version = G_VERSION,
 	.init = g_md_init,
 	.fini = g_md_fini,
 	.start = g_md_start,
 	.access = g_md_access,
 	.dumpconf = g_md_dumpconf,
 	.providergone = g_md_providergone,
 };
 
 DECLARE_GEOM_CLASS(g_md_class, g_md);
 
 
 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
 
 #define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
 #define NMASK	(NINDIR-1)
 static int nshift;
 
 static uma_zone_t md_pbuf_zone;
 
 struct indir {
 	uintptr_t	*array;
 	u_int		total;
 	u_int		used;
 	u_int		shift;
 };
 
 struct md_s {
 	int unit;
 	LIST_ENTRY(md_s) list;
 	struct bio_queue_head bio_queue;
 	struct mtx queue_mtx;
 	struct cdev *dev;
 	enum md_types type;
 	off_t mediasize;
 	unsigned sectorsize;
 	unsigned opencount;
 	unsigned fwheads;
 	unsigned fwsectors;
 	char ident[32];
 	unsigned flags;
 	char name[20];
 	struct proc *procp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	int (*start)(struct md_s *sc, struct bio *bp);
 	struct devstat *devstat;
 
 	/* MD_MALLOC related fields */
 	struct indir *indir;
 	uma_zone_t uma;
 
 	/* MD_PRELOAD related fields */
 	u_char *pl_ptr;
 	size_t pl_len;
 
 	/* MD_VNODE related fields */
 	struct vnode *vnode;
 	char file[PATH_MAX];
 	char label[PATH_MAX];
 	struct ucred *cred;
 
 	/* MD_SWAP related fields */
 	vm_object_t object;
 };
 
 static struct indir *
 new_indir(u_int shift)
 {
 	struct indir *ip;
 
 	ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
 	    | M_ZERO);
 	if (ip == NULL)
 		return (NULL);
 	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
 	    M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
 	if (ip->array == NULL) {
 		free(ip, M_MD);
 		return (NULL);
 	}
 	ip->total = NINDIR;
 	ip->shift = shift;
 	return (ip);
 }
 
 static void
 del_indir(struct indir *ip)
 {
 
 	free(ip->array, M_MDSECT);
 	free(ip, M_MD);
 }
 
 static void
 destroy_indir(struct md_s *sc, struct indir *ip)
 {
 	int i;
 
 	for (i = 0; i < NINDIR; i++) {
 		if (!ip->array[i])
 			continue;
 		if (ip->shift)
 			destroy_indir(sc, (struct indir*)(ip->array[i]));
 		else if (ip->array[i] > 255)
 			uma_zfree(sc->uma, (void *)(ip->array[i]));
 	}
 	del_indir(ip);
 }
 
 /*
  * This function does the math and allocates the top level "indir" structure
  * for a device of "size" sectors.
  */
 
 static struct indir *
 dimension(off_t size)
 {
 	off_t rcnt;
 	struct indir *ip;
 	int layer;
 
 	rcnt = size;
 	layer = 0;
 	while (rcnt > NINDIR) {
 		rcnt /= NINDIR;
 		layer++;
 	}
 
 	/*
 	 * XXX: the top layer is probably not fully populated, so we allocate
 	 * too much space for ip->array in here.
 	 */
 	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
 	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
 	    M_MDSECT, M_WAITOK | M_ZERO);
 	ip->total = NINDIR;
 	ip->shift = layer * nshift;
 	return (ip);
 }
 
 /*
  * Read a given sector
  */
 
 static uintptr_t
 s_read(struct indir *ip, off_t offset)
 {
 	struct indir *cip;
 	int idx;
 	uintptr_t up;
 
 	if (md_debug > 1)
 		printf("s_read(%jd)\n", (intmax_t)offset);
 	up = 0;
 	for (cip = ip; cip != NULL;) {
 		if (cip->shift) {
 			idx = (offset >> cip->shift) & NMASK;
 			up = cip->array[idx];
 			cip = (struct indir *)up;
 			continue;
 		}
 		idx = offset & NMASK;
 		return (cip->array[idx]);
 	}
 	return (0);
 }
 
 /*
  * Write a given sector, prune the tree if the value is 0
  */
 
 static int
 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
 {
 	struct indir *cip, *lip[10];
 	int idx, li;
 	uintptr_t up;
 
 	if (md_debug > 1)
 		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
 	up = 0;
 	li = 0;
 	cip = ip;
 	for (;;) {
 		lip[li++] = cip;
 		if (cip->shift) {
 			idx = (offset >> cip->shift) & NMASK;
 			up = cip->array[idx];
 			if (up != 0) {
 				cip = (struct indir *)up;
 				continue;
 			}
 			/* Allocate branch */
 			cip->array[idx] =
 			    (uintptr_t)new_indir(cip->shift - nshift);
 			if (cip->array[idx] == 0)
 				return (ENOSPC);
 			cip->used++;
 			up = cip->array[idx];
 			cip = (struct indir *)up;
 			continue;
 		}
 		/* leafnode */
 		idx = offset & NMASK;
 		up = cip->array[idx];
 		if (up != 0)
 			cip->used--;
 		cip->array[idx] = ptr;
 		if (ptr != 0)
 			cip->used++;
 		break;
 	}
 	if (cip->used != 0 || li == 1)
 		return (0);
 	li--;
 	while (cip->used == 0 && cip != ip) {
 		li--;
 		idx = (offset >> lip[li]->shift) & NMASK;
 		up = lip[li]->array[idx];
 		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
 		del_indir(cip);
 		lip[li]->array[idx] = 0;
 		lip[li]->used--;
 		cip = lip[li];
 	}
 	return (0);
 }
 
 
 static int
 g_md_access(struct g_provider *pp, int r, int w, int e)
 {
 	struct md_s *sc;
 
 	sc = pp->geom->softc;
 	if (sc == NULL) {
 		if (r <= 0 && w <= 0 && e <= 0)
 			return (0);
 		return (ENXIO);
 	}
 	r += pp->acr;
 	w += pp->acw;
 	e += pp->ace;
 	if ((sc->flags & MD_READONLY) != 0 && w > 0)
 		return (EROFS);
 	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
 		sc->opencount = 1;
 	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
 		sc->opencount = 0;
 	}
 	return (0);
 }
 
 static void
 g_md_start(struct bio *bp)
 {
 	struct md_s *sc;
 
 	sc = bp->bio_to->geom->softc;
 	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
 		devstat_start_transaction_bio(sc->devstat, bp);
 	}
 	mtx_lock(&sc->queue_mtx);
 	bioq_disksort(&sc->bio_queue, bp);
 	wakeup(sc);
 	mtx_unlock(&sc->queue_mtx);
 }
 
 #define	MD_MALLOC_MOVE_ZERO	1
 #define	MD_MALLOC_MOVE_FILL	2
 #define	MD_MALLOC_MOVE_READ	3
 #define	MD_MALLOC_MOVE_WRITE	4
 #define	MD_MALLOC_MOVE_CMP	5
 
 static int
 md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
     void *ptr, u_char fill, int op)
 {
 	struct sf_buf *sf;
 	vm_page_t m, *mp1;
 	char *p, first;
 	off_t *uc;
 	unsigned n;
 	int error, i, ma_offs1, sz, first_read;
 
 	m = NULL;
 	error = 0;
 	sf = NULL;
 	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
 		first = 0;
 		first_read = 0;
 		uc = ptr;
 		mp1 = *mp;
 		ma_offs1 = *ma_offs;
 	/* } */
 	sched_pin();
 	for (n = sectorsize; n != 0; n -= sz) {
 		sz = imin(PAGE_SIZE - *ma_offs, n);
 		if (m != **mp) {
 			if (sf != NULL)
 				sf_buf_free(sf);
 			m = **mp;
 			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
 			    (md_malloc_wait ? 0 : SFB_NOWAIT));
 			if (sf == NULL) {
 				error = ENOMEM;
 				break;
 			}
 		}
 		p = (char *)sf_buf_kva(sf) + *ma_offs;
 		switch (op) {
 		case MD_MALLOC_MOVE_ZERO:
 			bzero(p, sz);
 			break;
 		case MD_MALLOC_MOVE_FILL:
 			memset(p, fill, sz);
 			break;
 		case MD_MALLOC_MOVE_READ:
 			bcopy(ptr, p, sz);
 			cpu_flush_dcache(p, sz);
 			break;
 		case MD_MALLOC_MOVE_WRITE:
 			bcopy(p, ptr, sz);
 			break;
 		case MD_MALLOC_MOVE_CMP:
 			for (i = 0; i < sz; i++, p++) {
 				if (!first_read) {
 					*uc = (u_char)*p;
 					first = *p;
 					first_read = 1;
 				} else if (*p != first) {
 					error = EDOOFUS;
 					break;
 				}
 			}
 			break;
 		default:
 			KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
 			break;
 		}
 		if (error != 0)
 			break;
 		*ma_offs += sz;
 		*ma_offs %= PAGE_SIZE;
 		if (*ma_offs == 0)
 			(*mp)++;
 		ptr = (char *)ptr + sz;
 	}
 
 	if (sf != NULL)
 		sf_buf_free(sf);
 	sched_unpin();
 	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
 		*mp = mp1;
 		*ma_offs = ma_offs1;
 	}
 	return (error);
 }
 
 static int
 md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
     unsigned len, void *ptr, u_char fill, int op)
 {
 	bus_dma_segment_t *vlist;
 	uint8_t *p, *end, first;
 	off_t *uc;
 	int ma_offs, seg_len;
 
 	vlist = *pvlist;
 	ma_offs = *pma_offs;
 	uc = ptr;
 
 	for (; len != 0; len -= seg_len) {
 		seg_len = imin(vlist->ds_len - ma_offs, len);
 		p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
 		switch (op) {
 		case MD_MALLOC_MOVE_ZERO:
 			bzero(p, seg_len);
 			break;
 		case MD_MALLOC_MOVE_FILL:
 			memset(p, fill, seg_len);
 			break;
 		case MD_MALLOC_MOVE_READ:
 			bcopy(ptr, p, seg_len);
 			cpu_flush_dcache(p, seg_len);
 			break;
 		case MD_MALLOC_MOVE_WRITE:
 			bcopy(p, ptr, seg_len);
 			break;
 		case MD_MALLOC_MOVE_CMP:
 			end = p + seg_len;
 			first = *uc = *p;
 			/* Confirm all following bytes match the first */
 			while (++p < end) {
 				if (*p != first)
 					return (EDOOFUS);
 			}
 			break;
 		default:
 			KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
 			break;
 		}
 
 		ma_offs += seg_len;
 		if (ma_offs == vlist->ds_len) {
 			ma_offs = 0;
 			vlist++;
 		}
 		ptr = (uint8_t *)ptr + seg_len;
 	}
 	*pvlist = vlist;
 	*pma_offs = ma_offs;
 
 	return (0);
 }
 
 static int
 mdstart_malloc(struct md_s *sc, struct bio *bp)
 {
 	u_char *dst;
 	vm_page_t *m;
 	bus_dma_segment_t *vlist;
 	int i, error, error1, ma_offs, notmapped;
 	off_t secno, nsec, uc;
 	uintptr_t sp, osp;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
 	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
 	    (bus_dma_segment_t *)bp->bio_data : NULL;
 	if (notmapped) {
 		m = bp->bio_ma;
 		ma_offs = bp->bio_ma_offset;
 		dst = NULL;
 		KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
 	} else if (vlist != NULL) {
 		ma_offs = bp->bio_ma_offset;
 		dst = NULL;
 	} else {
 		dst = bp->bio_data;
 	}
 
 	nsec = bp->bio_length / sc->sectorsize;
 	secno = bp->bio_offset / sc->sectorsize;
 	error = 0;
 	while (nsec--) {
 		osp = s_read(sc->indir, secno);
 		if (bp->bio_cmd == BIO_DELETE) {
 			if (osp != 0)
 				error = s_write(sc->indir, secno, 0);
 		} else if (bp->bio_cmd == BIO_READ) {
 			if (osp == 0) {
 				if (notmapped) {
 					error = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, NULL, 0,
 					    MD_MALLOC_MOVE_ZERO);
 				} else if (vlist != NULL) {
 					error = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize, NULL, 0,
 					    MD_MALLOC_MOVE_ZERO);
 				} else
 					bzero(dst, sc->sectorsize);
 			} else if (osp <= 255) {
 				if (notmapped) {
 					error = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, NULL, osp,
 					    MD_MALLOC_MOVE_FILL);
 				} else if (vlist != NULL) {
 					error = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize, NULL, osp,
 					    MD_MALLOC_MOVE_FILL);
 				} else
 					memset(dst, osp, sc->sectorsize);
 			} else {
 				if (notmapped) {
 					error = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, (void *)osp, 0,
 					    MD_MALLOC_MOVE_READ);
 				} else if (vlist != NULL) {
 					error = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize,
 					    (void *)osp, 0,
 					    MD_MALLOC_MOVE_READ);
 				} else {
 					bcopy((void *)osp, dst, sc->sectorsize);
 					cpu_flush_dcache(dst, sc->sectorsize);
 				}
 			}
 			osp = 0;
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			if (sc->flags & MD_COMPRESS) {
 				if (notmapped) {
 					error1 = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, &uc, 0,
 					    MD_MALLOC_MOVE_CMP);
 					i = error1 == 0 ? sc->sectorsize : 0;
 				} else if (vlist != NULL) {
 					error1 = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize, &uc, 0,
 					    MD_MALLOC_MOVE_CMP);
 					i = error1 == 0 ? sc->sectorsize : 0;
 				} else {
 					uc = dst[0];
 					for (i = 1; i < sc->sectorsize; i++) {
 						if (dst[i] != uc)
 							break;
 					}
 				}
 			} else {
 				i = 0;
 				uc = 0;
 			}
 			if (i == sc->sectorsize) {
 				if (osp != uc)
 					error = s_write(sc->indir, secno, uc);
 			} else {
 				if (osp <= 255) {
 					sp = (uintptr_t)uma_zalloc(sc->uma,
 					    md_malloc_wait ? M_WAITOK :
 					    M_NOWAIT);
 					if (sp == 0) {
 						error = ENOSPC;
 						break;
 					}
 					if (notmapped) {
 						error = md_malloc_move_ma(&m,
 						    &ma_offs, sc->sectorsize,
 						    (void *)sp, 0,
 						    MD_MALLOC_MOVE_WRITE);
 					} else if (vlist != NULL) {
 						error = md_malloc_move_vlist(
 						    &vlist, &ma_offs,
 						    sc->sectorsize, (void *)sp,
 						    0, MD_MALLOC_MOVE_WRITE);
 					} else {
 						bcopy(dst, (void *)sp,
 						    sc->sectorsize);
 					}
 					error = s_write(sc->indir, secno, sp);
 				} else {
 					if (notmapped) {
 						error = md_malloc_move_ma(&m,
 						    &ma_offs, sc->sectorsize,
 						    (void *)osp, 0,
 						    MD_MALLOC_MOVE_WRITE);
 					} else if (vlist != NULL) {
 						error = md_malloc_move_vlist(
 						    &vlist, &ma_offs,
 						    sc->sectorsize, (void *)osp,
 						    0, MD_MALLOC_MOVE_WRITE);
 					} else {
 						bcopy(dst, (void *)osp,
 						    sc->sectorsize);
 					}
 					osp = 0;
 				}
 			}
 		} else {
 			error = EOPNOTSUPP;
 		}
 		if (osp > 255)
 			uma_zfree(sc->uma, (void*)osp);
 		if (error != 0)
 			break;
 		secno++;
 		if (!notmapped && vlist == NULL)
 			dst += sc->sectorsize;
 	}
 	bp->bio_resid = 0;
 	return (error);
 }
 
 static void
 mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
 {
 	off_t seg_len;
 
 	while (offset >= vlist->ds_len) {
 		offset -= vlist->ds_len;
 		vlist++;
 	}
 
 	while (len != 0) {
 		seg_len = omin(len, vlist->ds_len - offset);
 		bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
 		    seg_len);
 		offset = 0;
 		src = (uint8_t *)src + seg_len;
 		len -= seg_len;
 		vlist++;
 	}
 }
 
 static void
 mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
 {
 	off_t seg_len;
 
 	while (offset >= vlist->ds_len) {
 		offset -= vlist->ds_len;
 		vlist++;
 	}
 
 	while (len != 0) {
 		seg_len = omin(len, vlist->ds_len - offset);
 		bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
 		    seg_len);
 		offset = 0;
 		dst = (uint8_t *)dst + seg_len;
 		len -= seg_len;
 		vlist++;
 	}
 }
 
 static int
 mdstart_preload(struct md_s *sc, struct bio *bp)
 {
 	uint8_t *p;
 
 	p = sc->pl_ptr + bp->bio_offset;
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		if ((bp->bio_flags & BIO_VLIST) != 0) {
 			mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
 			    bp->bio_ma_offset, bp->bio_length);
 		} else {
 			bcopy(p, bp->bio_data, bp->bio_length);
 		}
 		cpu_flush_dcache(bp->bio_data, bp->bio_length);
 		break;
 	case BIO_WRITE:
 		if ((bp->bio_flags & BIO_VLIST) != 0) {
 			mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
 			    bp->bio_ma_offset, p, bp->bio_length);
 		} else {
 			bcopy(bp->bio_data, p, bp->bio_length);
 		}
 		break;
 	}
 	bp->bio_resid = 0;
 	return (0);
 }
 
 static int
 mdstart_vnode(struct md_s *sc, struct bio *bp)
 {
 	int error;
 	struct uio auio;
 	struct iovec aiov;
 	struct iovec *piov;
 	struct mount *mp;
 	struct vnode *vp;
 	struct buf *pb;
 	bus_dma_segment_t *vlist;
 	struct thread *td;
 	off_t iolen, iostart, len, zerosize;
 	int ma_offs, npages;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		auio.uio_rw = UIO_READ;
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		auio.uio_rw = UIO_WRITE;
 		break;
 	case BIO_FLUSH:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	td = curthread;
 	vp = sc->vnode;
 	pb = NULL;
 	piov = NULL;
 	ma_offs = bp->bio_ma_offset;
 	len = bp->bio_length;
 
 	/*
 	 * VNODE I/O
 	 *
 	 * If an error occurs, we set BIO_ERROR but we do not set
 	 * B_INVAL because (for a write anyway), the buffer is
 	 * still valid.
 	 */
 
 	if (bp->bio_cmd == BIO_FLUSH) {
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(vp, MNT_WAIT, td);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 		return (error);
 	}
 
 	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
 	auio.uio_resid = bp->bio_length;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 
 	if (bp->bio_cmd == BIO_DELETE) {
 		/*
 		 * Emulate BIO_DELETE by writing zeros.
 		 */
 		zerosize = ZERO_REGION_SIZE -
 		    (ZERO_REGION_SIZE % sc->sectorsize);
 		auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
 		piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
 		auio.uio_iov = piov;
 		while (len > 0) {
 			piov->iov_base = __DECONST(void *, zero_region);
 			piov->iov_len = len;
 			if (len > zerosize)
 				piov->iov_len = zerosize;
 			len -= piov->iov_len;
 			piov++;
 		}
 		piov = auio.uio_iov;
 	} else if ((bp->bio_flags & BIO_VLIST) != 0) {
 		piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
 		auio.uio_iov = piov;
 		vlist = (bus_dma_segment_t *)bp->bio_data;
 		while (len > 0) {
 			piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
 			    ma_offs);
 			piov->iov_len = vlist->ds_len - ma_offs;
 			if (piov->iov_len > len)
 				piov->iov_len = len;
 			len -= piov->iov_len;
 			ma_offs = 0;
 			vlist++;
 			piov++;
 		}
 		auio.uio_iovcnt = piov - auio.uio_iov;
 		piov = auio.uio_iov;
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
 		bp->bio_resid = len;
 unmapped_step:
 		npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
 		    PAGE_MASK))));
 		iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
 		KASSERT(iolen > 0, ("zero iolen"));
 		pmap_qenter((vm_offset_t)pb->b_data,
 		    &bp->bio_ma[atop(ma_offs)], npages);
 		aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
 		    (ma_offs & PAGE_MASK));
 		aiov.iov_len = iolen;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_resid = iolen;
 	} else {
 		aiov.iov_base = bp->bio_data;
 		aiov.iov_len = bp->bio_length;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 	}
 	iostart = auio.uio_offset;
 	if (auio.uio_rw == UIO_READ) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_READ(vp, &auio, 0, sc->cred);
 		VOP_UNLOCK(vp);
 	} else {
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
 		    sc->cred);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 		if (error == 0)
 			sc->flags &= ~MD_VERIFY;
 	}
 
 	/* When MD_CACHE is set, try to avoid double-caching the data. */
 	if (error == 0 && (sc->flags & MD_CACHE) == 0)
 		VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 
 	if (pb != NULL) {
 		pmap_qremove((vm_offset_t)pb->b_data, npages);
 		if (error == 0) {
 			len -= iolen;
 			bp->bio_resid -= iolen;
 			ma_offs += iolen;
 			if (len > 0)
 				goto unmapped_step;
 		}
 		uma_zfree(md_pbuf_zone, pb);
 	}
 
 	free(piov, M_MD);
 	if (pb == NULL)
 		bp->bio_resid = auio.uio_resid;
 	return (error);
 }
 
 static int
 mdstart_swap(struct md_s *sc, struct bio *bp)
 {
 	vm_page_t m;
 	u_char *p;
 	vm_pindex_t i, lastp;
 	bus_dma_segment_t *vlist;
 	int rv, ma_offs, offs, len, lastend;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	p = bp->bio_data;
 	ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
 	    bp->bio_ma_offset : 0;
 	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
 	    (bus_dma_segment_t *)bp->bio_data : NULL;
 
 	/*
 	 * offs is the offset at which to start operating on the
 	 * next (ie, first) page.  lastp is the last page on
 	 * which we're going to operate.  lastend is the ending
 	 * position within that last page (ie, PAGE_SIZE if
 	 * we're operating on complete aligned pages).
 	 */
 	offs = bp->bio_offset % PAGE_SIZE;
 	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
 
 	rv = VM_PAGER_OK;
 	vm_object_pip_add(sc->object, 1);
 	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 		m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM);
 		if (bp->bio_cmd == BIO_READ) {
 			if (vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
 			else
 				rv = vm_pager_get_pages(sc->object, &m, 1,
 				    NULL, NULL);
 			if (rv == VM_PAGER_ERROR) {
 				VM_OBJECT_WLOCK(sc->object);
 				vm_page_free(m);
 				VM_OBJECT_WUNLOCK(sc->object);
 				break;
 			} else if (rv == VM_PAGER_FAIL) {
 				/*
 				 * Pager does not have the page.  Zero
 				 * the allocated page, and mark it as
 				 * valid. Do not set dirty, the page
 				 * can be recreated if thrown out.
 				 */
 				pmap_zero_page(m);
 				vm_page_valid(m);
 			}
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				pmap_copy_pages(&m, offs, bp->bio_ma,
 				    ma_offs, len);
 			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
 				physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
 				    vlist, ma_offs, len);
 				cpu_flush_dcache(p, len);
 			} else {
 				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
 				cpu_flush_dcache(p, len);
 			}
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			if (len == PAGE_SIZE || vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
 			else
 				rv = vm_pager_get_pages(sc->object, &m, 1,
 				    NULL, NULL);
 			if (rv == VM_PAGER_ERROR) {
 				VM_OBJECT_WLOCK(sc->object);
 				vm_page_free(m);
 				VM_OBJECT_WUNLOCK(sc->object);
 				break;
 			} else if (rv == VM_PAGER_FAIL)
 				pmap_zero_page(m);
 
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
 				    offs, len);
 			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
 				physcopyin_vlist(vlist, ma_offs,
 				    VM_PAGE_TO_PHYS(m) + offs, len);
 			} else {
 				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
 			}
 
 			vm_page_valid(m);
 			vm_page_set_dirty(m);
 		} else if (bp->bio_cmd == BIO_DELETE) {
 			if (len == PAGE_SIZE || vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
 			else
 				rv = vm_pager_get_pages(sc->object, &m, 1,
 				    NULL, NULL);
 			VM_OBJECT_WLOCK(sc->object);
 			if (rv == VM_PAGER_ERROR) {
 				vm_page_free(m);
 				VM_OBJECT_WUNLOCK(sc->object);
 				break;
 			} else if (rv == VM_PAGER_FAIL) {
 				vm_page_free(m);
 				m = NULL;
 			} else {
 				/* Page is valid. */
 				if (len != PAGE_SIZE) {
 					pmap_zero_page_area(m, offs, len);
 					vm_page_set_dirty(m);
 				} else {
 					vm_pager_page_unswapped(m);
 					vm_page_free(m);
 					m = NULL;
 				}
 			}
 			VM_OBJECT_WUNLOCK(sc->object);
 		}
 		if (m != NULL) {
 			vm_page_xunbusy(m);
 
 			/*
 			 * The page may be deactivated prior to setting
 			 * PGA_REFERENCED, but in this case it will be
 			 * reactivated by the page daemon.
 			 */
 			if (vm_page_active(m))
 				vm_page_reference(m);
 			else
 				vm_page_activate(m);
 		}
 
 		/* Actions on further pages start at offset 0 */
 		p += PAGE_SIZE - offs;
 		offs = 0;
 		ma_offs += len;
 	}
 	vm_object_pip_wakeup(sc->object);
 	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 }
 
 static int
 mdstart_null(struct md_s *sc, struct bio *bp)
 {
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		bzero(bp->bio_data, bp->bio_length);
 		cpu_flush_dcache(bp->bio_data, bp->bio_length);
 		break;
 	case BIO_WRITE:
 		break;
 	}
 	bp->bio_resid = 0;
 	return (0);
 }
 
 static void
 md_kthread(void *arg)
 {
 	struct md_s *sc;
 	struct bio *bp;
 	int error;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 	if (sc->type == MD_VNODE)
 		curthread->td_pflags |= TDP_NORUNNINGBUF;
 
 	for (;;) {
 		mtx_lock(&sc->queue_mtx);
 		if (sc->flags & MD_SHUTDOWN) {
 			sc->flags |= MD_EXITING;
 			mtx_unlock(&sc->queue_mtx);
 			kproc_exit(0);
 		}
 		bp = bioq_takefirst(&sc->bio_queue);
 		if (!bp) {
 			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
 			continue;
 		}
 		mtx_unlock(&sc->queue_mtx);
 		if (bp->bio_cmd == BIO_GETATTR) {
 			int isv = ((sc->flags & MD_VERIFY) != 0);
 
 			if ((sc->fwsectors && sc->fwheads &&
 			    (g_handleattr_int(bp, "GEOM::fwsectors",
 			    sc->fwsectors) ||
 			    g_handleattr_int(bp, "GEOM::fwheads",
 			    sc->fwheads))) ||
 			    g_handleattr_int(bp, "GEOM::candelete", 1))
 				error = -1;
 			else if (sc->ident[0] != '\0' &&
 			    g_handleattr_str(bp, "GEOM::ident", sc->ident))
 				error = -1;
 			else if (g_handleattr_int(bp, "MNT::verified", isv))
 				error = -1;
 			else
 				error = EOPNOTSUPP;
 		} else {
 			error = sc->start(sc, bp);
 		}
 
 		if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
 			/*
 			 * Devstat uses (bio_bcount, bio_resid) for
 			 * determining the length of the completed part of
 			 * the i/o.  g_io_deliver() will translate from
 			 * bio_completed to that, but it also destroys the
 			 * bio so we must do our own translation.
 			 */
 			bp->bio_bcount = bp->bio_length;
 			bp->bio_resid = (error == -1 ? bp->bio_bcount : 0);
 			devstat_end_transaction_bio(sc->devstat, bp);
 		}
 		if (error != -1) {
 			bp->bio_completed = bp->bio_length;
 			g_io_deliver(bp, error);
 		}
 	}
 }
 
 static struct md_s *
 mdfind(int unit)
 {
 	struct md_s *sc;
 
 	LIST_FOREACH(sc, &md_softc_list, list) {
 		if (sc->unit == unit)
 			break;
 	}
 	return (sc);
 }
 
 static struct md_s *
 mdnew(int unit, int *errp, enum md_types type)
 {
 	struct md_s *sc;
 	int error;
 
 	*errp = 0;
 	if (unit == -1)
 		unit = alloc_unr(md_uh);
 	else
 		unit = alloc_unr_specific(md_uh, unit);
 
 	if (unit == -1) {
 		*errp = EBUSY;
 		return (NULL);
 	}
 
 	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
 	sc->type = type;
 	bioq_init(&sc->bio_queue);
 	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
 	sc->unit = unit;
 	sprintf(sc->name, "md%d", unit);
 	LIST_INSERT_HEAD(&md_softc_list, sc, list);
 	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
 	if (error == 0)
 		return (sc);
 	LIST_REMOVE(sc, list);
 	mtx_destroy(&sc->queue_mtx);
 	free_unr(md_uh, sc->unit);
 	free(sc, M_MD);
 	*errp = error;
 	return (NULL);
 }
 
 static void
 mdinit(struct md_s *sc)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	g_topology_lock();
 	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
 	gp->softc = sc;
 	pp = g_new_providerf(gp, "md%d", sc->unit);
 	devstat_remove_entry(pp->stat);
 	pp->stat = NULL;
 	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	pp->mediasize = sc->mediasize;
 	pp->sectorsize = sc->sectorsize;
 	switch (sc->type) {
 	case MD_MALLOC:
 	case MD_VNODE:
 	case MD_SWAP:
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 		break;
 	case MD_PRELOAD:
 	case MD_NULL:
 		break;
 	}
 	sc->gp = gp;
 	sc->pp = pp;
 	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
 	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 	sc->devstat->id = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 }
 
 static int
 mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
 {
 	uintptr_t sp;
 	int error;
 	off_t u;
 
 	error = 0;
 	if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
 		return (EINVAL);
 	if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
 		return (EINVAL);
 	/* Compression doesn't make sense if we have reserved space */
 	if (mdr->md_options & MD_RESERVE)
 		mdr->md_options &= ~MD_COMPRESS;
 	if (mdr->md_fwsectors != 0)
 		sc->fwsectors = mdr->md_fwsectors;
 	if (mdr->md_fwheads != 0)
 		sc->fwheads = mdr->md_fwheads;
 	sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
 	sc->indir = dimension(sc->mediasize / sc->sectorsize);
 	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
 	    0x1ff, 0);
 	if (mdr->md_options & MD_RESERVE) {
 		off_t nsectors;
 
 		nsectors = sc->mediasize / sc->sectorsize;
 		for (u = 0; u < nsectors; u++) {
 			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
 			    M_WAITOK : M_NOWAIT) | M_ZERO);
 			if (sp != 0)
 				error = s_write(sc->indir, u, sp);
 			else
 				error = ENOMEM;
 			if (error != 0)
 				break;
 		}
 	}
 	return (error);
 }
 
 
 static int
 mdsetcred(struct md_s *sc, struct ucred *cred)
 {
 	char *tmpbuf;
 	int error = 0;
 
 	/*
 	 * Set credits in our softc
 	 */
 
 	if (sc->cred)
 		crfree(sc->cred);
 	sc->cred = crhold(cred);
 
 	/*
 	 * Horrible kludge to establish credentials for NFS  XXX.
 	 */
 
 	if (sc->vnode) {
 		struct uio auio;
 		struct iovec aiov;
 
 		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
 		bzero(&auio, sizeof(auio));
 
 		aiov.iov_base = tmpbuf;
 		aiov.iov_len = sc->sectorsize;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_resid = aiov.iov_len;
 		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
 		VOP_UNLOCK(sc->vnode);
 		free(tmpbuf, M_TEMP);
 	}
 	return (error);
 }
 
 static int
 mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
 {
 	struct vattr vattr;
 	struct nameidata nd;
 	char *fname;
 	int error, flags;
 
 	fname = mdr->md_file;
 	if (mdr->md_file_seg == UIO_USERSPACE) {
 		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
 		if (error != 0)
 			return (error);
 	} else if (mdr->md_file_seg == UIO_SYSSPACE)
 		strlcpy(sc->file, fname, sizeof(sc->file));
 	else
 		return (EDOOFUS);
 
 	/*
 	 * If the user specified that this is a read only device, don't
 	 * set the FWRITE mask before trying to open the backing store.
 	 */
 	flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) \
 	    | ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_vp->v_type != VREG) {
 		error = EINVAL;
 		goto bad;
 	}
 	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
 	if (error != 0)
 		goto bad;
 	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
 		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
 		if (VN_IS_DOOMED(nd.ni_vp)) {
 			/* Forced unmount. */
 			error = EBADF;
 			goto bad;
 		}
 	}
 	nd.ni_vp->v_vflag |= VV_MD;
 	VOP_UNLOCK(nd.ni_vp);
 
 	if (mdr->md_fwsectors != 0)
 		sc->fwsectors = mdr->md_fwsectors;
 	if (mdr->md_fwheads != 0)
 		sc->fwheads = mdr->md_fwheads;
 	snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
 	    (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
 	sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
 	    MD_VERIFY);
 	if (!(flags & FWRITE))
 		sc->flags |= MD_READONLY;
 	sc->vnode = nd.ni_vp;
 
 	error = mdsetcred(sc, td->td_ucred);
 	if (error != 0) {
 		sc->vnode = NULL;
 		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 		nd.ni_vp->v_vflag &= ~VV_MD;
 		goto bad;
 	}
 	return (0);
 bad:
 	VOP_UNLOCK(nd.ni_vp);
 	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 	return (error);
 }
 
 static void
 g_md_providergone(struct g_provider *pp)
 {
 	struct md_s *sc = pp->geom->softc;
 
 	mtx_lock(&sc->queue_mtx);
 	sc->flags |= MD_PROVIDERGONE;
 	wakeup(&sc->flags);
 	mtx_unlock(&sc->queue_mtx);
 }
 
 static int
 mddestroy(struct md_s *sc, struct thread *td)
 {
 
 	if (sc->gp) {
 		g_topology_lock();
 		g_wither_geom(sc->gp, ENXIO);
 		g_topology_unlock();
 
 		mtx_lock(&sc->queue_mtx);
 		while (!(sc->flags & MD_PROVIDERGONE))
 			msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
 		mtx_unlock(&sc->queue_mtx);
 	}
 	if (sc->devstat) {
 		devstat_remove_entry(sc->devstat);
 		sc->devstat = NULL;
 	}
 	mtx_lock(&sc->queue_mtx);
 	sc->flags |= MD_SHUTDOWN;
 	wakeup(sc);
 	while (!(sc->flags & MD_EXITING))
 		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
 	mtx_unlock(&sc->queue_mtx);
 	mtx_destroy(&sc->queue_mtx);
 	if (sc->vnode != NULL) {
 		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 		sc->vnode->v_vflag &= ~VV_MD;
 		VOP_UNLOCK(sc->vnode);
 		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
 		    FREAD : (FREAD|FWRITE), sc->cred, td);
 	}
 	if (sc->cred != NULL)
 		crfree(sc->cred);
 	if (sc->object != NULL)
 		vm_object_deallocate(sc->object);
 	if (sc->indir)
 		destroy_indir(sc, sc->indir);
 	if (sc->uma)
 		uma_zdestroy(sc->uma);
 
 	LIST_REMOVE(sc, list);
 	free_unr(md_uh, sc->unit);
 	free(sc, M_MD);
 	return (0);
 }
 
 static int
 mdresize(struct md_s *sc, struct md_req *mdr)
 {
 	int error, res;
 	vm_pindex_t oldpages, newpages;
 
 	switch (sc->type) {
 	case MD_VNODE:
 	case MD_NULL:
 		break;
 	case MD_SWAP:
 		if (mdr->md_mediasize <= 0 ||
 		    (mdr->md_mediasize % PAGE_SIZE) != 0)
 			return (EDOM);
 		oldpages = OFF_TO_IDX(round_page(sc->mediasize));
 		newpages = OFF_TO_IDX(round_page(mdr->md_mediasize));
 		if (newpages < oldpages) {
 			VM_OBJECT_WLOCK(sc->object);
 			vm_object_page_remove(sc->object, newpages, 0, 0);
-			swap_pager_freespace(sc->object, newpages,
-			    oldpages - newpages);
 			swap_release_by_cred(IDX_TO_OFF(oldpages -
 			    newpages), sc->cred);
 			sc->object->charge = IDX_TO_OFF(newpages);
 			sc->object->size = newpages;
 			VM_OBJECT_WUNLOCK(sc->object);
 		} else if (newpages > oldpages) {
 			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
 			    oldpages), sc->cred);
 			if (!res)
 				return (ENOMEM);
 			if ((mdr->md_options & MD_RESERVE) ||
 			    (sc->flags & MD_RESERVE)) {
 				error = swap_pager_reserve(sc->object,
 				    oldpages, newpages - oldpages);
 				if (error < 0) {
 					swap_release_by_cred(
 					    IDX_TO_OFF(newpages - oldpages),
 					    sc->cred);
 					return (EDOM);
 				}
 			}
 			VM_OBJECT_WLOCK(sc->object);
 			sc->object->charge = IDX_TO_OFF(newpages);
 			sc->object->size = newpages;
 			VM_OBJECT_WUNLOCK(sc->object);
 		}
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	sc->mediasize = mdr->md_mediasize;
 	g_topology_lock();
 	g_resize_provider(sc->pp, sc->mediasize);
 	g_topology_unlock();
 	return (0);
 }
 
 static int
 mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
 {
 	vm_ooffset_t npage;
 	int error;
 
 	/*
 	 * Range check.  Disallow negative sizes and sizes not being
 	 * multiple of page size.
 	 */
 	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 		return (EDOM);
 
 	/*
 	 * Allocate an OBJT_SWAP object.
 	 *
 	 * Note the truncation.
 	 */
 
 	if ((mdr->md_options & MD_VERIFY) != 0)
 		return (EINVAL);
 	npage = mdr->md_mediasize / PAGE_SIZE;
 	if (mdr->md_fwsectors != 0)
 		sc->fwsectors = mdr->md_fwsectors;
 	if (mdr->md_fwheads != 0)
 		sc->fwheads = mdr->md_fwheads;
 	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
 	    VM_PROT_DEFAULT, 0, td->td_ucred);
 	if (sc->object == NULL)
 		return (ENOMEM);
 	sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
 	if (mdr->md_options & MD_RESERVE) {
 		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
 			error = EDOM;
 			goto finish;
 		}
 	}
 	error = mdsetcred(sc, td->td_ucred);
  finish:
 	if (error != 0) {
 		vm_object_deallocate(sc->object);
 		sc->object = NULL;
 	}
 	return (error);
 }
 
 static int
 mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
 {
 
 	/*
 	 * Range check.  Disallow negative sizes and sizes not being
 	 * multiple of page size.
 	 */
 	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 		return (EDOM);
 
 	return (0);
 }
 
 static int
 kern_mdattach_locked(struct thread *td, struct md_req *mdr)
 {
 	struct md_s *sc;
 	unsigned sectsize;
 	int error, i;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	switch (mdr->md_type) {
 	case MD_MALLOC:
 	case MD_PRELOAD:
 	case MD_VNODE:
 	case MD_SWAP:
 	case MD_NULL:
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (mdr->md_sectorsize == 0)
 		sectsize = DEV_BSIZE;
 	else
 		sectsize = mdr->md_sectorsize;
 	if (sectsize > MAXPHYS || mdr->md_mediasize < sectsize)
 		return (EINVAL);
 	if (mdr->md_options & MD_AUTOUNIT)
 		sc = mdnew(-1, &error, mdr->md_type);
 	else {
 		if (mdr->md_unit > INT_MAX)
 			return (EINVAL);
 		sc = mdnew(mdr->md_unit, &error, mdr->md_type);
 	}
 	if (sc == NULL)
 		return (error);
 	if (mdr->md_label != NULL)
 		error = copyinstr(mdr->md_label, sc->label,
 		    sizeof(sc->label), NULL);
 	if (error != 0)
 		goto err_after_new;
 	if (mdr->md_options & MD_AUTOUNIT)
 		mdr->md_unit = sc->unit;
 	sc->mediasize = mdr->md_mediasize;
 	sc->sectorsize = sectsize;
 	error = EDOOFUS;
 	switch (sc->type) {
 	case MD_MALLOC:
 		sc->start = mdstart_malloc;
 		error = mdcreate_malloc(sc, mdr);
 		break;
 	case MD_PRELOAD:
 		/*
 		 * We disallow attaching preloaded memory disks via
 		 * ioctl. Preloaded memory disks are automatically
 		 * attached in g_md_init().
 		 */
 		error = EOPNOTSUPP;
 		break;
 	case MD_VNODE:
 		sc->start = mdstart_vnode;
 		error = mdcreate_vnode(sc, mdr, td);
 		break;
 	case MD_SWAP:
 		sc->start = mdstart_swap;
 		error = mdcreate_swap(sc, mdr, td);
 		break;
 	case MD_NULL:
 		sc->start = mdstart_null;
 		error = mdcreate_null(sc, mdr, td);
 		break;
 	}
 err_after_new:
 	if (error != 0) {
 		mddestroy(sc, td);
 		return (error);
 	}
 
 	/* Prune off any residual fractional sector */
 	i = sc->mediasize % sc->sectorsize;
 	sc->mediasize -= i;
 
 	mdinit(sc);
 	return (0);
 }
 
 static int
 kern_mdattach(struct thread *td, struct md_req *mdr)
 {
 	int error;
 
 	sx_xlock(&md_sx);
 	error = kern_mdattach_locked(td, mdr);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 static int
 kern_mddetach_locked(struct thread *td, struct md_req *mdr)
 {
 	struct md_s *sc;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	if (mdr->md_mediasize != 0 ||
 	    (mdr->md_options & ~MD_FORCE) != 0)
 		return (EINVAL);
 
 	sc = mdfind(mdr->md_unit);
 	if (sc == NULL)
 		return (ENOENT);
 	if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
 	    !(mdr->md_options & MD_FORCE))
 		return (EBUSY);
 	return (mddestroy(sc, td));
 }
 
 static int
 kern_mddetach(struct thread *td, struct md_req *mdr)
 {
 	int error;
 
 	sx_xlock(&md_sx);
 	error = kern_mddetach_locked(td, mdr);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 static int
 kern_mdresize_locked(struct md_req *mdr)
 {
 	struct md_s *sc;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
 		return (EINVAL);
 
 	sc = mdfind(mdr->md_unit);
 	if (sc == NULL)
 		return (ENOENT);
 	if (mdr->md_mediasize < sc->sectorsize)
 		return (EINVAL);
 	if (mdr->md_mediasize < sc->mediasize &&
 	    !(sc->flags & MD_FORCE) &&
 	    !(mdr->md_options & MD_FORCE))
 		return (EBUSY);
 	return (mdresize(sc, mdr));
 }
 
 static int
 kern_mdresize(struct md_req *mdr)
 {
 	int error;
 
 	sx_xlock(&md_sx);
 	error = kern_mdresize_locked(mdr);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 static int
 kern_mdquery_locked(struct md_req *mdr)
 {
 	struct md_s *sc;
 	int error;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	sc = mdfind(mdr->md_unit);
 	if (sc == NULL)
 		return (ENOENT);
 	mdr->md_type = sc->type;
 	mdr->md_options = sc->flags;
 	mdr->md_mediasize = sc->mediasize;
 	mdr->md_sectorsize = sc->sectorsize;
 	error = 0;
 	if (mdr->md_label != NULL) {
 		error = copyout(sc->label, mdr->md_label,
 		    strlen(sc->label) + 1);
 		if (error != 0)
 			return (error);
 	}
 	if (sc->type == MD_VNODE ||
 	    (sc->type == MD_PRELOAD && mdr->md_file != NULL))
 		error = copyout(sc->file, mdr->md_file,
 		    strlen(sc->file) + 1);
 	return (error);
 }
 
 static int
 kern_mdquery(struct md_req *mdr)
 {
 	int error;
 
 	sx_xlock(&md_sx);
 	error = kern_mdquery_locked(mdr);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 /* Copy members that are not userspace pointers. */
 #define	MD_IOCTL2REQ(mdio, mdr) do {					\
 	(mdr)->md_unit = (mdio)->md_unit;				\
 	(mdr)->md_type = (mdio)->md_type;				\
 	(mdr)->md_mediasize = (mdio)->md_mediasize;			\
 	(mdr)->md_sectorsize = (mdio)->md_sectorsize;			\
 	(mdr)->md_options = (mdio)->md_options;				\
 	(mdr)->md_fwheads = (mdio)->md_fwheads;				\
 	(mdr)->md_fwsectors = (mdio)->md_fwsectors;			\
 	(mdr)->md_units = &(mdio)->md_pad[0];				\
 	(mdr)->md_units_nitems = nitems((mdio)->md_pad);		\
 } while(0)
 
 /* Copy members that might have been updated */
 #define MD_REQ2IOCTL(mdr, mdio) do {					\
 	(mdio)->md_unit = (mdr)->md_unit;				\
 	(mdio)->md_type = (mdr)->md_type;				\
 	(mdio)->md_mediasize = (mdr)->md_mediasize;			\
 	(mdio)->md_sectorsize = (mdr)->md_sectorsize;			\
 	(mdio)->md_options = (mdr)->md_options;				\
 	(mdio)->md_fwheads = (mdr)->md_fwheads;				\
 	(mdio)->md_fwsectors = (mdr)->md_fwsectors;			\
 } while(0)
 
 static int
 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
     struct thread *td)
 {
 	struct md_req mdr;
 	int error;
 
 	if (md_debug)
 		printf("mdctlioctl(%s %lx %p %x %p)\n",
 			devtoname(dev), cmd, addr, flags, td);
 
 	bzero(&mdr, sizeof(mdr));
 	switch (cmd) {
 	case MDIOCATTACH:
 	case MDIOCDETACH:
 	case MDIOCRESIZE:
 	case MDIOCQUERY: {
 		struct md_ioctl *mdio = (struct md_ioctl *)addr;
 		if (mdio->md_version != MDIOVERSION)
 			return (EINVAL);
 		MD_IOCTL2REQ(mdio, &mdr);
 		mdr.md_file = mdio->md_file;
 		mdr.md_file_seg = UIO_USERSPACE;
 		/* If the file is adjacent to the md_ioctl it's in kernel. */
 		if ((void *)mdio->md_file == (void *)(mdio + 1))
 			mdr.md_file_seg = UIO_SYSSPACE;
 		mdr.md_label = mdio->md_label;
 		break;
 	}
 #ifdef COMPAT_FREEBSD32
 	case MDIOCATTACH_32:
 	case MDIOCDETACH_32:
 	case MDIOCRESIZE_32:
 	case MDIOCQUERY_32: {
 		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 		if (mdio->md_version != MDIOVERSION)
 			return (EINVAL);
 		MD_IOCTL2REQ(mdio, &mdr);
 		mdr.md_file = (void *)(uintptr_t)mdio->md_file;
 		mdr.md_file_seg = UIO_USERSPACE;
 		mdr.md_label = (void *)(uintptr_t)mdio->md_label;
 		break;
 	}
 #endif
 	default:
 		/* Fall through to handler switch. */
 		break;
 	}
 
 	error = 0;
 	switch (cmd) {
 	case MDIOCATTACH:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCATTACH_32:
 #endif
 		error = kern_mdattach(td, &mdr);
 		break;
 	case MDIOCDETACH:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCDETACH_32:
 #endif
 		error = kern_mddetach(td, &mdr);
 		break;
 	case MDIOCRESIZE:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCRESIZE_32:
 #endif
 		error = kern_mdresize(&mdr);
 		break;
 	case MDIOCQUERY:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCQUERY_32:
 #endif
 		error = kern_mdquery(&mdr);
 		break;
 	default:
 		error = ENOIOCTL;
 	}
 
 	switch (cmd) {
 	case MDIOCATTACH:
 	case MDIOCQUERY: {
 		struct md_ioctl *mdio = (struct md_ioctl *)addr;
 		MD_REQ2IOCTL(&mdr, mdio);
 		break;
 	}
 #ifdef COMPAT_FREEBSD32
 	case MDIOCATTACH_32:
 	case MDIOCQUERY_32: {
 		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 		MD_REQ2IOCTL(&mdr, mdio);
 		break;
 	}
 #endif
 	default:
 		/* Other commands to not alter mdr. */
 		break;
 	}
 
 	return (error);
 }
 
 static void
 md_preloaded(u_char *image, size_t length, const char *name)
 {
 	struct md_s *sc;
 	int error;
 
 	sc = mdnew(-1, &error, MD_PRELOAD);
 	if (sc == NULL)
 		return;
 	sc->mediasize = length;
 	sc->sectorsize = DEV_BSIZE;
 	sc->pl_ptr = image;
 	sc->pl_len = length;
 	sc->start = mdstart_preload;
 	if (name != NULL)
 		strlcpy(sc->file, name, sizeof(sc->file));
 #ifdef MD_ROOT
 	if (sc->unit == 0) {
 #ifndef ROOTDEVNAME
 		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
 #endif
 #ifdef MD_ROOT_READONLY
 		sc->flags |= MD_READONLY;
 #endif
 	}
 #endif
 	mdinit(sc);
 	if (name != NULL) {
 		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
 		    MD_NAME, sc->unit, name, length, image);
 	} else {
 		printf("%s%d: Embedded image %zd bytes at %p\n",
 		    MD_NAME, sc->unit, length, image);
 	}
 }
 
 static void
 g_md_init(struct g_class *mp __unused)
 {
 	caddr_t mod;
 	u_char *ptr, *name, *type;
 	unsigned len;
 	int i;
 
 	/* figure out log2(NINDIR) */
 	for (i = NINDIR, nshift = -1; i; nshift++)
 		i >>= 1;
 
 	mod = NULL;
 	sx_init(&md_sx, "MD config lock");
 	g_topology_unlock();
 	md_uh = new_unrhdr(0, INT_MAX, NULL);
 #ifdef MD_ROOT
 	if (mfs_root_size != 0) {
 		sx_xlock(&md_sx);
 #ifdef MD_ROOT_MEM
 		md_preloaded(mfs_root, mfs_root_size, NULL);
 #else
 		md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
 		    NULL);
 #endif
 		sx_xunlock(&md_sx);
 	}
 #endif
 	/* XXX: are preload_* static or do they need Giant ? */
 	while ((mod = preload_search_next_name(mod)) != NULL) {
 		name = (char *)preload_search_info(mod, MODINFO_NAME);
 		if (name == NULL)
 			continue;
 		type = (char *)preload_search_info(mod, MODINFO_TYPE);
 		if (type == NULL)
 			continue;
 		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 			continue;
 		ptr = preload_fetch_addr(mod);
 		len = preload_fetch_size(mod);
 		if (ptr != NULL && len != 0) {
 			sx_xlock(&md_sx);
 			md_preloaded(ptr, len, name);
 			sx_xunlock(&md_sx);
 		}
 	}
 	md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
 	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 	    0600, MDCTL_NAME);
 	g_topology_lock();
 }
 
 static void
 g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp __unused, struct g_provider *pp)
 {
 	struct md_s *mp;
 	char *type;
 
 	mp = gp->softc;
 	if (mp == NULL)
 		return;
 
 	switch (mp->type) {
 	case MD_MALLOC:
 		type = "malloc";
 		break;
 	case MD_PRELOAD:
 		type = "preload";
 		break;
 	case MD_VNODE:
 		type = "vnode";
 		break;
 	case MD_SWAP:
 		type = "swap";
 		break;
 	case MD_NULL:
 		type = "null";
 		break;
 	default:
 		type = "unknown";
 		break;
 	}
 
 	if (pp != NULL) {
 		if (indent == NULL) {
 			sbuf_printf(sb, " u %d", mp->unit);
 			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
 			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
 			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
 			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
 			sbuf_printf(sb, " t %s", type);
 			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 			    (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
 				sbuf_printf(sb, " file %s", mp->file);
 			sbuf_printf(sb, " label %s", mp->label);
 		} else {
 			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
 			    mp->unit);
 			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
 			    indent, (uintmax_t) mp->sectorsize);
 			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
 			    indent, (uintmax_t) mp->fwheads);
 			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
 			    indent, (uintmax_t) mp->fwsectors);
 			if (mp->ident[0] != '\0') {
 				sbuf_printf(sb, "%s<ident>", indent);
 				g_conf_printf_escaped(sb, "%s", mp->ident);
 				sbuf_printf(sb, "</ident>\n");
 			}
 			sbuf_printf(sb, "%s<length>%ju</length>\n",
 			    indent, (uintmax_t) mp->mediasize);
 			sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
 			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
 			sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 			    (mp->flags & MD_READONLY) == 0 ? "read-write":
 			    "read-only");
 			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 			    type);
 			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 			    (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
 				sbuf_printf(sb, "%s<file>", indent);
 				g_conf_printf_escaped(sb, "%s", mp->file);
 				sbuf_printf(sb, "</file>\n");
 			}
 			if (mp->type == MD_VNODE)
 				sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
 				    (mp->flags & MD_CACHE) == 0 ? "off": "on");
 			sbuf_printf(sb, "%s<label>", indent);
 			g_conf_printf_escaped(sb, "%s", mp->label);
 			sbuf_printf(sb, "</label>\n");
 		}
 	}
 }
 
 static void
 g_md_fini(struct g_class *mp __unused)
 {
 
 	sx_destroy(&md_sx);
 	if (status_dev != NULL)
 		destroy_dev(status_dev);
 	uma_zdestroy(md_pbuf_zone);
 	delete_unrhdr(md_uh);
 }
Index: head/sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- head/sys/fs/tmpfs/tmpfs_subr.c	(revision 362612)
+++ head/sys/fs/tmpfs/tmpfs_subr.c	(revision 362613)
@@ -1,1934 +1,1931 @@
 /*	$NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c) 2005 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Efficient memory file system supporting functions.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/fnv_hash.h>
 #include <sys/lock.h>
 #include <sys/limits.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/swap_pager.h>
 
 #include <fs/tmpfs/tmpfs.h>
 #include <fs/tmpfs/tmpfs_fifoops.h>
 #include <fs/tmpfs/tmpfs_vnops.h>
 
 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "tmpfs file system");
 
 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED;
 
 static uma_zone_t tmpfs_dirent_pool;
 static uma_zone_t tmpfs_node_pool;
 
 static int
 tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	node->tn_gen++;
 	node->tn_size = 0;
 	node->tn_status = 0;
 	node->tn_flags = 0;
 	node->tn_links = 0;
 	node->tn_vnode = NULL;
 	node->tn_vpstate = 0;
 	return (0);
 }
 
 static void
 tmpfs_node_dtor(void *mem, int size, void *arg)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	node->tn_type = VNON;
 }
 
 static int
 tmpfs_node_init(void *mem, int size, int flags)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	node->tn_id = 0;
 	mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF);
 	node->tn_gen = arc4random();
 	return (0);
 }
 
 static void
 tmpfs_node_fini(void *mem, int size)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	mtx_destroy(&node->tn_interlock);
 }
 
 void
 tmpfs_subr_init(void)
 {
 	tmpfs_dirent_pool = uma_zcreate("TMPFS dirent",
 	    sizeof(struct tmpfs_dirent), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	tmpfs_node_pool = uma_zcreate("TMPFS node",
 	    sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor,
 	    tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0);
 }
 
 void
 tmpfs_subr_uninit(void)
 {
 	uma_zdestroy(tmpfs_node_pool);
 	uma_zdestroy(tmpfs_dirent_pool);
 }
 
 static int
 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long pages, bytes;
 
 	pages = *(long *)arg1;
 	bytes = pages * PAGE_SIZE;
 
 	error = sysctl_handle_long(oidp, &bytes, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	pages = bytes / PAGE_SIZE;
 	if (pages < TMPFS_PAGES_MINRESERVED)
 		return (EINVAL);
 
 	*(long *)arg1 = pages;
 	return (0);
 }
 
 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &tmpfs_pages_reserved, 0,
     sysctl_mem_reserved, "L",
     "Amount of available memory and swap below which tmpfs growth stops");
 
 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a,
     struct tmpfs_dirent *b);
 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp);
 
 size_t
 tmpfs_mem_avail(void)
 {
 	vm_ooffset_t avail;
 
 	avail = swap_pager_avail + vm_free_count() - tmpfs_pages_reserved;
 	if (__predict_false(avail < 0))
 		avail = 0;
 	return (avail);
 }
 
 size_t
 tmpfs_pages_used(struct tmpfs_mount *tmp)
 {
 	const size_t node_size = sizeof(struct tmpfs_node) +
 	    sizeof(struct tmpfs_dirent);
 	size_t meta_pages;
 
 	meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size,
 	    PAGE_SIZE);
 	return (meta_pages + tmp->tm_pages_used);
 }
 
 static size_t
 tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages)
 {
 	if (tmpfs_mem_avail() < req_pages)
 		return (0);
 
 	if (tmp->tm_pages_max != ULONG_MAX &&
 	    tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp))
 			return (0);
 
 	return (1);
 }
 
 void
 tmpfs_ref_node(struct tmpfs_node *node)
 {
 
 	TMPFS_NODE_LOCK(node);
 	tmpfs_ref_node_locked(node);
 	TMPFS_NODE_UNLOCK(node);
 }
 
 void
 tmpfs_ref_node_locked(struct tmpfs_node *node)
 {
 
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	KASSERT(node->tn_refcount > 0, ("node %p zero refcount", node));
 	KASSERT(node->tn_refcount < UINT_MAX, ("node %p refcount %u", node,
 	    node->tn_refcount));
 	node->tn_refcount++;
 }
 
 /*
  * Allocates a new node of type 'type' inside the 'tmp' mount point, with
  * its owner set to 'uid', its group to 'gid' and its mode set to 'mode',
  * using the credentials of the process 'p'.
  *
  * If the node type is set to 'VDIR', then the parent parameter must point
  * to the parent directory of the node being created.  It may only be NULL
  * while allocating the root node.
  *
  * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter
  * specifies the device the node represents.
  *
  * If the node type is set to 'VLNK', then the parameter target specifies
  * the file name of the target file for the symbolic link that is being
  * created.
  *
  * Note that new nodes are retrieved from the available list if it has
  * items or, if it is empty, from the node pool as long as there is enough
  * space to create them.
  *
  * Returns zero on success or an appropriate error code on failure.
  */
 int
 tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type,
     uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent,
     const char *target, dev_t rdev, struct tmpfs_node **node)
 {
 	struct tmpfs_node *nnode;
 	vm_object_t obj;
 
 	/* If the root directory of the 'tmp' file system is not yet
 	 * allocated, this must be the request to do it. */
 	MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));
 
 	MPASS(IFF(type == VLNK, target != NULL));
 	MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL));
 
 	if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max)
 		return (ENOSPC);
 	if (tmpfs_pages_check_avail(tmp, 1) == 0)
 		return (ENOSPC);
 
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		/*
 		 * When a new tmpfs node is created for fully
 		 * constructed mount point, there must be a parent
 		 * node, which vnode is locked exclusively.  As
 		 * consequence, if the unmount is executing in
 		 * parallel, vflush() cannot reclaim the parent vnode.
 		 * Due to this, the check for MNTK_UNMOUNT flag is not
 		 * racy: if we did not see MNTK_UNMOUNT flag, then tmp
 		 * cannot be destroyed until node construction is
 		 * finished and the parent vnode unlocked.
 		 *
 		 * Tmpfs does not need to instantiate new nodes during
 		 * unmount.
 		 */
 		return (EBUSY);
 	}
 	if ((mp->mnt_kern_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
 	nnode = uma_zalloc_arg(tmpfs_node_pool, tmp, M_WAITOK);
 
 	/* Generic initialization. */
 	nnode->tn_type = type;
 	vfs_timestamp(&nnode->tn_atime);
 	nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime =
 	    nnode->tn_atime;
 	nnode->tn_uid = uid;
 	nnode->tn_gid = gid;
 	nnode->tn_mode = mode;
 	nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr);
 	nnode->tn_refcount = 1;
 
 	/* Type-specific initialization. */
 	switch (nnode->tn_type) {
 	case VBLK:
 	case VCHR:
 		nnode->tn_rdev = rdev;
 		break;
 
 	case VDIR:
 		RB_INIT(&nnode->tn_dir.tn_dirhead);
 		LIST_INIT(&nnode->tn_dir.tn_dupindex);
 		MPASS(parent != nnode);
 		MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL));
 		nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent;
 		nnode->tn_dir.tn_readdir_lastn = 0;
 		nnode->tn_dir.tn_readdir_lastp = NULL;
 		nnode->tn_links++;
 		TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent);
 		nnode->tn_dir.tn_parent->tn_links++;
 		TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent);
 		break;
 
 	case VFIFO:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 
 	case VLNK:
 		MPASS(strlen(target) < MAXPATHLEN);
 		nnode->tn_size = strlen(target);
 		nnode->tn_link = malloc(nnode->tn_size, M_TMPFSNAME,
 		    M_WAITOK);
 		memcpy(nnode->tn_link, target, nnode->tn_size);
 		break;
 
 	case VREG:
 		obj = nnode->tn_reg.tn_aobj =
 		    vm_pager_allocate(OBJT_SWAP, NULL, 0, VM_PROT_DEFAULT, 0,
 			NULL /* XXXKIB - tmpfs needs swap reservation */);
 		VM_OBJECT_WLOCK(obj);
 		/* OBJ_TMPFS is set together with the setting of vp->v_object */
 		vm_object_set_flag(obj, OBJ_TMPFS_NODE);
 		VM_OBJECT_WUNLOCK(obj);
 		break;
 
 	default:
 		panic("tmpfs_alloc_node: type %p %d", nnode,
 		    (int)nnode->tn_type);
 	}
 
 	TMPFS_LOCK(tmp);
 	LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries);
 	nnode->tn_attached = true;
 	tmp->tm_nodes_inuse++;
 	tmp->tm_refcount++;
 	TMPFS_UNLOCK(tmp);
 
 	*node = nnode;
 	return (0);
 }
 
 /*
  * Destroys the node pointed to by node from the file system 'tmp'.
  * If the node references a directory, no entries are allowed.
  */
 void
 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node)
 {
 
 	TMPFS_LOCK(tmp);
 	TMPFS_NODE_LOCK(node);
 	if (!tmpfs_free_node_locked(tmp, node, false)) {
 		TMPFS_NODE_UNLOCK(node);
 		TMPFS_UNLOCK(tmp);
 	}
 }
 
 bool
 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node,
     bool detach)
 {
 	vm_object_t uobj;
 
 	TMPFS_MP_ASSERT_LOCKED(tmp);
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	KASSERT(node->tn_refcount > 0, ("node %p refcount zero", node));
 
 	node->tn_refcount--;
 	if (node->tn_attached && (detach || node->tn_refcount == 0)) {
 		MPASS(tmp->tm_nodes_inuse > 0);
 		tmp->tm_nodes_inuse--;
 		LIST_REMOVE(node, tn_entries);
 		node->tn_attached = false;
 	}
 	if (node->tn_refcount > 0)
 		return (false);
 
 #ifdef INVARIANTS
 	MPASS(node->tn_vnode == NULL);
 	MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0);
 #endif
 	TMPFS_NODE_UNLOCK(node);
 	TMPFS_UNLOCK(tmp);
 
 	switch (node->tn_type) {
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VDIR:
 		/* FALLTHROUGH */
 	case VFIFO:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 
 	case VLNK:
 		free(node->tn_link, M_TMPFSNAME);
 		break;
 
 	case VREG:
 		uobj = node->tn_reg.tn_aobj;
 		if (uobj != NULL) {
 			if (uobj->size != 0)
 				atomic_subtract_long(&tmp->tm_pages_used, uobj->size);
 			KASSERT((uobj->flags & OBJ_TMPFS) == 0,
 			    ("leaked OBJ_TMPFS node %p vm_obj %p", node, uobj));
 			vm_object_deallocate(uobj);
 		}
 		break;
 
 	default:
 		panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type);
 	}
 
 	uma_zfree(tmpfs_node_pool, node);
 	TMPFS_LOCK(tmp);
 	tmpfs_free_tmp(tmp);
 	return (true);
 }
 
 static __inline uint32_t
 tmpfs_dirent_hash(const char *name, u_int len)
 {
 	uint32_t hash;
 
 	hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK;
 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP
 	hash &= 0xf;
 #endif
 	if (hash < TMPFS_DIRCOOKIE_MIN)
 		hash += TMPFS_DIRCOOKIE_MIN;
 
 	return (hash);
 }
 
 static __inline off_t
 tmpfs_dirent_cookie(struct tmpfs_dirent *de)
 {
 	if (de == NULL)
 		return (TMPFS_DIRCOOKIE_EOF);
 
 	MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN);
 
 	return (de->td_cookie);
 }
 
 static __inline boolean_t
 tmpfs_dirent_dup(struct tmpfs_dirent *de)
 {
 	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0);
 }
 
 static __inline boolean_t
 tmpfs_dirent_duphead(struct tmpfs_dirent *de)
 {
 	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0);
 }
 
 void
 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen)
 {
 	de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen);
 	memcpy(de->ud.td_name, name, namelen);
 	de->td_namelen = namelen;
 }
 
 /*
  * Allocates a new directory entry for the node node with a name of name.
  * The new directory entry is returned in *de.
  *
  * The link count of node is increased by one to reflect the new object
  * referencing it.
  *
  * Returns zero on success or an appropriate error code on failure.
  */
 int
 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node,
     const char *name, u_int len, struct tmpfs_dirent **de)
 {
 	struct tmpfs_dirent *nde;
 
 	nde = uma_zalloc(tmpfs_dirent_pool, M_WAITOK);
 	nde->td_node = node;
 	if (name != NULL) {
 		nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK);
 		tmpfs_dirent_init(nde, name, len);
 	} else
 		nde->td_namelen = 0;
 	if (node != NULL)
 		node->tn_links++;
 
 	*de = nde;
 
 	return 0;
 }
 
 /*
  * Frees a directory entry.  It is the caller's responsibility to destroy
  * the node referenced by it if needed.
  *
  * The link count of node is decreased by one to reflect the removal of an
  * object that referenced it.  This only happens if 'node_exists' is true;
  * otherwise the function will not access the node referred to by the
  * directory entry, as it may already have been released from the outside.
  */
 void
 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_node *node;
 
 	node = de->td_node;
 	if (node != NULL) {
 		MPASS(node->tn_links > 0);
 		node->tn_links--;
 	}
 	if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL)
 		free(de->ud.td_name, M_TMPFSNAME);
 	uma_zfree(tmpfs_dirent_pool, de);
 }
 
 void
 tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj)
 {
 
 	ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject");
 	if (vp->v_type != VREG || obj == NULL)
 		return;
 
 	VM_OBJECT_WLOCK(obj);
 	VI_LOCK(vp);
 	vm_object_clear_flag(obj, OBJ_TMPFS);
 	obj->un_pager.swp.swp_tmpfs = NULL;
 	if (vp->v_writecount < 0)
 		vp->v_writecount = 0;
 	VI_UNLOCK(vp);
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * Need to clear v_object for insmntque failure.
  */
 static void
 tmpfs_insmntque_dtr(struct vnode *vp, void *dtr_arg)
 {
 
 	tmpfs_destroy_vobject(vp, vp->v_object);
 	vp->v_object = NULL;
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Allocates a new vnode for the node node or returns a new reference to
  * an existing one if the node had already a vnode referencing it.  The
  * resulting locked vnode is returned in *vpp.
  *
  * Returns zero on success or an appropriate error code on failure.
  */
 int
 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	struct tmpfs_mount *tm;
 	vm_object_t object;
 	int error;
 
 	error = 0;
 	tm = VFS_TO_TMPFS(mp);
 	TMPFS_NODE_LOCK(node);
 	tmpfs_ref_node_locked(node);
 loop:
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	if ((vp = node->tn_vnode) != NULL) {
 		MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0);
 		VI_LOCK(vp);
 		if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) ||
 		    (VN_IS_DOOMED(vp) &&
 		     (lkflag & LK_NOWAIT) != 0)) {
 			VI_UNLOCK(vp);
 			TMPFS_NODE_UNLOCK(node);
 			error = ENOENT;
 			vp = NULL;
 			goto out;
 		}
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			node->tn_vpstate |= TMPFS_VNODE_WRECLAIM;
 			while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) {
 				msleep(&node->tn_vnode, TMPFS_NODE_MTX(node),
 				    0, "tmpfsE", 0);
 			}
 			goto loop;
 		}
 		TMPFS_NODE_UNLOCK(node);
 		error = vget(vp, lkflag | LK_INTERLOCK, curthread);
 		if (error == ENOENT) {
 			TMPFS_NODE_LOCK(node);
 			goto loop;
 		}
 		if (error != 0) {
 			vp = NULL;
 			goto out;
 		}
 
 		/*
 		 * Make sure the vnode is still there after
 		 * getting the interlock to avoid racing a free.
 		 */
 		if (node->tn_vnode == NULL || node->tn_vnode != vp) {
 			vput(vp);
 			TMPFS_NODE_LOCK(node);
 			goto loop;
 		}
 
 		goto out;
 	}
 
 	if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) ||
 	    (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) {
 		TMPFS_NODE_UNLOCK(node);
 		error = ENOENT;
 		vp = NULL;
 		goto out;
 	}
 
 	/*
 	 * otherwise lock the vp list while we call getnewvnode
 	 * since that can block.
 	 */
 	if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) {
 		node->tn_vpstate |= TMPFS_VNODE_WANT;
 		error = msleep((caddr_t) &node->tn_vpstate,
 		    TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0);
 		if (error != 0)
 			goto out;
 		goto loop;
 	} else
 		node->tn_vpstate |= TMPFS_VNODE_ALLOCATING;
 	
 	TMPFS_NODE_UNLOCK(node);
 
 	/* Get a new vnode and associate it with our node. */
 	error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ?
 	    &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp);
 	if (error != 0)
 		goto unlock;
 	MPASS(vp != NULL);
 
 	/* lkflag is ignored, the lock is exclusive */
 	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	vp->v_data = node;
 	vp->v_type = node->tn_type;
 
 	/* Type-specific initialization. */
 	switch (node->tn_type) {
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VLNK:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 	case VFIFO:
 		vp->v_op = &tmpfs_fifoop_entries;
 		break;
 	case VREG:
 		object = node->tn_reg.tn_aobj;
 		VM_OBJECT_WLOCK(object);
 		VI_LOCK(vp);
 		KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs"));
 		vp->v_object = object;
 		object->un_pager.swp.swp_tmpfs = vp;
 		vm_object_set_flag(object, OBJ_TMPFS);
 		VI_UNLOCK(vp);
 		VM_OBJECT_WUNLOCK(object);
 		break;
 	case VDIR:
 		MPASS(node->tn_dir.tn_parent != NULL);
 		if (node->tn_dir.tn_parent == node)
 			vp->v_vflag |= VV_ROOT;
 		break;
 
 	default:
 		panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type);
 	}
 	if (vp->v_type != VFIFO)
 		VN_LOCK_ASHARE(vp);
 
 	error = insmntque1(vp, mp, tmpfs_insmntque_dtr, NULL);
 	if (error != 0)
 		vp = NULL;
 
 unlock:
 	TMPFS_NODE_LOCK(node);
 
 	MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING);
 	node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING;
 	node->tn_vnode = vp;
 
 	if (node->tn_vpstate & TMPFS_VNODE_WANT) {
 		node->tn_vpstate &= ~TMPFS_VNODE_WANT;
 		TMPFS_NODE_UNLOCK(node);
 		wakeup((caddr_t) &node->tn_vpstate);
 	} else
 		TMPFS_NODE_UNLOCK(node);
 
 out:
 	if (error == 0) {
 		*vpp = vp;
 
 #ifdef INVARIANTS
 		MPASS(*vpp != NULL && VOP_ISLOCKED(*vpp));
 		TMPFS_NODE_LOCK(node);
 		MPASS(*vpp == node->tn_vnode);
 		TMPFS_NODE_UNLOCK(node);
 #endif
 	}
 	tmpfs_free_node(tm, node);
 
 	return (error);
 }
 
 /*
  * Destroys the association between the vnode vp and the node it
  * references.
  */
 void
 tmpfs_free_vp(struct vnode *vp)
 {
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	node->tn_vnode = NULL;
 	if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0)
 		wakeup(&node->tn_vnode);
 	node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM;
 	vp->v_data = NULL;
 }
 
 /*
  * Allocates a new file of type 'type' and adds it to the parent directory
  * 'dvp'; this addition is done using the component name given in 'cnp'.
  * The ownership of the new file is automatically assigned based on the
  * credentials of the caller (through 'cnp'), the group is set based on
  * the parent directory and the mode is determined from the 'vap' argument.
  * If successful, *vpp holds a vnode to the newly created file and zero
  * is returned.  Otherwise *vpp is NULL and the function returns an
  * appropriate error code.
  */
 int
 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap,
     struct componentname *cnp, const char *target)
 {
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *dnode;
 	struct tmpfs_node *node;
 	struct tmpfs_node *parent;
 
 	ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file");
 	MPASS(cnp->cn_flags & HASBUF);
 
 	tmp = VFS_TO_TMPFS(dvp->v_mount);
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	*vpp = NULL;
 
 	/* If the entry we are creating is a directory, we cannot overflow
 	 * the number of links of its parent, because it will get a new
 	 * link. */
 	if (vap->va_type == VDIR) {
 		/* Ensure that we do not overflow the maximum number of links
 		 * imposed by the system. */
 		MPASS(dnode->tn_links <= TMPFS_LINK_MAX);
 		if (dnode->tn_links == TMPFS_LINK_MAX) {
 			return (EMLINK);
 		}
 
 		parent = dnode;
 		MPASS(parent != NULL);
 	} else
 		parent = NULL;
 
 	/* Allocate a node that represents the new file. */
 	error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type,
 	    cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent,
 	    target, vap->va_rdev, &node);
 	if (error != 0)
 		return (error);
 
 	/* Allocate a directory entry that points to the new file. */
 	error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen,
 	    &de);
 	if (error != 0) {
 		tmpfs_free_node(tmp, node);
 		return (error);
 	}
 
 	/* Allocate a vnode for the new file. */
 	error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp);
 	if (error != 0) {
 		tmpfs_free_dirent(tmp, de);
 		tmpfs_free_node(tmp, node);
 		return (error);
 	}
 
 	/* Now that all required items are allocated, we can proceed to
 	 * insert the new node into the directory, an operation that
 	 * cannot fail. */
 	if (cnp->cn_flags & ISWHITEOUT)
 		tmpfs_dir_whiteout_remove(dvp, cnp);
 	tmpfs_dir_attach(dvp, de);
 	return (0);
 }
 
 struct tmpfs_dirent *
 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
 {
 	struct tmpfs_dirent *de;
 
 	de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead);
 	dc->tdc_tree = de;
 	if (de != NULL && tmpfs_dirent_duphead(de))
 		de = LIST_FIRST(&de->ud.td_duphead);
 	dc->tdc_current = de;
 
 	return (dc->tdc_current);
 }
 
 struct tmpfs_dirent *
 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
 {
 	struct tmpfs_dirent *de;
 
 	MPASS(dc->tdc_tree != NULL);
 	if (tmpfs_dirent_dup(dc->tdc_current)) {
 		dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries);
 		if (dc->tdc_current != NULL)
 			return (dc->tdc_current);
 	}
 	dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir,
 	    &dnode->tn_dir.tn_dirhead, dc->tdc_tree);
 	if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) {
 		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
 		MPASS(dc->tdc_current != NULL);
 	}
 
 	return (dc->tdc_current);
 }
 
 /* Lookup directory entry in RB-Tree. Function may return duphead entry. */
 static struct tmpfs_dirent *
 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash)
 {
 	struct tmpfs_dirent *de, dekey;
 
 	dekey.td_hash = hash;
 	de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey);
 	return (de);
 }
 
 /* Lookup directory entry by cookie, initialize directory cursor accordingly. */
 static struct tmpfs_dirent *
 tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie,
     struct tmpfs_dir_cursor *dc)
 {
 	struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead;
 	struct tmpfs_dirent *de, dekey;
 
 	MPASS(cookie >= TMPFS_DIRCOOKIE_MIN);
 
 	if (cookie == node->tn_dir.tn_readdir_lastn &&
 	    (de = node->tn_dir.tn_readdir_lastp) != NULL) {
 		/* Protect against possible race, tn_readdir_last[pn]
 		 * may be updated with only shared vnode lock held. */
 		if (cookie == tmpfs_dirent_cookie(de))
 			goto out;
 	}
 
 	if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) {
 		LIST_FOREACH(de, &node->tn_dir.tn_dupindex,
 		    uh.td_dup.index_entries) {
 			MPASS(tmpfs_dirent_dup(de));
 			if (de->td_cookie == cookie)
 				goto out;
 			/* dupindex list is sorted. */
 			if (de->td_cookie < cookie) {
 				de = NULL;
 				goto out;
 			}
 		}
 		MPASS(de == NULL);
 		goto out;
 	}
 
 	if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) {
 		de = NULL;
 	} else {
 		dekey.td_hash = cookie;
 		/* Recover if direntry for cookie was removed */
 		de = RB_NFIND(tmpfs_dir, dirhead, &dekey);
 	}
 	dc->tdc_tree = de;
 	dc->tdc_current = de;
 	if (de != NULL && tmpfs_dirent_duphead(de)) {
 		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
 		MPASS(dc->tdc_current != NULL);
 	}
 	return (dc->tdc_current);
 
 out:
 	dc->tdc_tree = de;
 	dc->tdc_current = de;
 	if (de != NULL && tmpfs_dirent_dup(de))
 		dc->tdc_tree = tmpfs_dir_xlookup_hash(node,
 		    de->td_hash);
 	return (dc->tdc_current);
 }
 
 /*
  * Looks for a directory entry in the directory represented by node.
  * 'cnp' describes the name of the entry to look for.  Note that the .
  * and .. components are not allowed as they do not physically exist
  * within directories.
  *
  * Returns a pointer to the entry when found, otherwise NULL.
  */
 struct tmpfs_dirent *
 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f,
     struct componentname *cnp)
 {
 	struct tmpfs_dir_duphead *duphead;
 	struct tmpfs_dirent *de;
 	uint32_t hash;
 
 	MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
 	MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
 	    cnp->cn_nameptr[1] == '.')));
 	TMPFS_VALIDATE_DIR(node);
 
 	hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen);
 	de = tmpfs_dir_xlookup_hash(node, hash);
 	if (de != NULL && tmpfs_dirent_duphead(de)) {
 		duphead = &de->ud.td_duphead;
 		LIST_FOREACH(de, duphead, uh.td_dup.entries) {
 			if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
 			    cnp->cn_namelen))
 				break;
 		}
 	} else if (de != NULL) {
 		if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
 		    cnp->cn_namelen))
 			de = NULL;
 	}
 	if (de != NULL && f != NULL && de->td_node != f)
 		de = NULL;
 
 	return (de);
 }
 
 /*
  * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex
  * list, allocate new cookie value.
  */
 static void
 tmpfs_dir_attach_dup(struct tmpfs_node *dnode,
     struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde)
 {
 	struct tmpfs_dir_duphead *dupindex;
 	struct tmpfs_dirent *de, *pde;
 
 	dupindex = &dnode->tn_dir.tn_dupindex;
 	de = LIST_FIRST(dupindex);
 	if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) {
 		if (de == NULL)
 			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
 		else
 			nde->td_cookie = de->td_cookie + 1;
 		MPASS(tmpfs_dirent_dup(nde));
 		LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries);
 		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 		return;
 	}
 
 	/*
 	 * Cookie numbers are near exhaustion. Scan dupindex list for unused
 	 * numbers. dupindex list is sorted in descending order. Keep it so
 	 * after inserting nde.
 	 */
 	while (1) {
 		pde = de;
 		de = LIST_NEXT(de, uh.td_dup.index_entries);
 		if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) {
 			/*
 			 * Last element of the index doesn't have minimal cookie
 			 * value, use it.
 			 */
 			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
 			LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries);
 			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 			return;
 		} else if (de == NULL) {
 			/*
 			 * We are so lucky have 2^30 hash duplicates in single
 			 * directory :) Return largest possible cookie value.
 			 * It should be fine except possible issues with
 			 * VOP_READDIR restart.
 			 */
 			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX;
 			LIST_INSERT_HEAD(dupindex, nde,
 			    uh.td_dup.index_entries);
 			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 			return;
 		}
 		if (de->td_cookie + 1 == pde->td_cookie ||
 		    de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX)
 			continue;	/* No hole or invalid cookie. */
 		nde->td_cookie = de->td_cookie + 1;
 		MPASS(tmpfs_dirent_dup(nde));
 		MPASS(pde->td_cookie > nde->td_cookie);
 		MPASS(nde->td_cookie > de->td_cookie);
 		LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries);
 		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 		return;
 	}
 }
 
 /*
  * Attaches the directory entry de to the directory represented by vp.
  * Note that this does not change the link count of the node pointed by
  * the directory entry, as this is done by tmpfs_alloc_dirent.
  */
 void
 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_node *dnode;
 	struct tmpfs_dirent *xde, *nde;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	MPASS(de->td_namelen > 0);
 	MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN);
 	MPASS(de->td_cookie == de->td_hash);
 
 	dnode = VP_TO_TMPFS_DIR(vp);
 	dnode->tn_dir.tn_readdir_lastn = 0;
 	dnode->tn_dir.tn_readdir_lastp = NULL;
 
 	MPASS(!tmpfs_dirent_dup(de));
 	xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
 	if (xde != NULL && tmpfs_dirent_duphead(xde))
 		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
 	else if (xde != NULL) {
 		/*
 		 * Allocate new duphead. Swap xde with duphead to avoid
 		 * adding/removing elements with the same hash.
 		 */
 		MPASS(!tmpfs_dirent_dup(xde));
 		tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0,
 		    &nde);
 		/* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */
 		memcpy(nde, xde, sizeof(*xde));
 		xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD;
 		LIST_INIT(&xde->ud.td_duphead);
 		xde->td_namelen = 0;
 		xde->td_node = NULL;
 		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde);
 		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
 	}
 	dnode->tn_size += sizeof(struct tmpfs_dirent);
 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
 	    TMPFS_NODE_MODIFIED;
 	tmpfs_update(vp);
 }
 
 /*
  * Detaches the directory entry de from the directory represented by vp.
  * Note that this does not change the link count of the node pointed by
  * the directory entry, as this is done by tmpfs_free_dirent.
  */
 void
 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_dir *head;
 	struct tmpfs_node *dnode;
 	struct tmpfs_dirent *xde;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	dnode = VP_TO_TMPFS_DIR(vp);
 	head = &dnode->tn_dir.tn_dirhead;
 	dnode->tn_dir.tn_readdir_lastn = 0;
 	dnode->tn_dir.tn_readdir_lastp = NULL;
 
 	if (tmpfs_dirent_dup(de)) {
 		/* Remove duphead if de was last entry. */
 		if (LIST_NEXT(de, uh.td_dup.entries) == NULL) {
 			xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash);
 			MPASS(tmpfs_dirent_duphead(xde));
 		} else
 			xde = NULL;
 		LIST_REMOVE(de, uh.td_dup.entries);
 		LIST_REMOVE(de, uh.td_dup.index_entries);
 		if (xde != NULL) {
 			if (LIST_EMPTY(&xde->ud.td_duphead)) {
 				RB_REMOVE(tmpfs_dir, head, xde);
 				tmp = VFS_TO_TMPFS(vp->v_mount);
 				MPASS(xde->td_node == NULL);
 				tmpfs_free_dirent(tmp, xde);
 			}
 		}
 		de->td_cookie = de->td_hash;
 	} else
 		RB_REMOVE(tmpfs_dir, head, de);
 
 	dnode->tn_size -= sizeof(struct tmpfs_dirent);
 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
 	    TMPFS_NODE_MODIFIED;
 	tmpfs_update(vp);
 }
 
 void
 tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode)
 {
 	struct tmpfs_dirent *de, *dde, *nde;
 
 	RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) {
 		RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
 		/* Node may already be destroyed. */
 		de->td_node = NULL;
 		if (tmpfs_dirent_duphead(de)) {
 			while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) {
 				LIST_REMOVE(dde, uh.td_dup.entries);
 				dde->td_node = NULL;
 				tmpfs_free_dirent(tmp, dde);
 			}
 		}
 		tmpfs_free_dirent(tmp, de);
 	}
 }
 
 /*
  * Helper function for tmpfs_readdir.  Creates a '.' entry for the given
  * directory and returns it in the uio space.  The function returns 0
  * on success, -1 if there was not enough space in the uio structure to
  * hold the directory entry or an appropriate error code if another
  * error happens.
  */
 static int
 tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
     struct uio *uio)
 {
 	int error;
 	struct dirent dent;
 
 	TMPFS_VALIDATE_DIR(node);
 	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT);
 
 	dent.d_fileno = node->tn_id;
 	dent.d_type = DT_DIR;
 	dent.d_namlen = 1;
 	dent.d_name[0] = '.';
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 	dirent_terminate(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
 		error = EJUSTRETURN;
 	else
 		error = uiomove(&dent, dent.d_reclen, uio);
 
 	tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED);
 
 	return (error);
 }
 
 /*
  * Helper function for tmpfs_readdir.  Creates a '..' entry for the given
  * directory and returns it in the uio space.  The function returns 0
  * on success, -1 if there was not enough space in the uio structure to
  * hold the directory entry or an appropriate error code if another
  * error happens.
  */
 static int
 tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
     struct uio *uio)
 {
 	struct tmpfs_node *parent;
 	struct dirent dent;
 	int error;
 
 	TMPFS_VALIDATE_DIR(node);
 	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT);
 
 	/*
 	 * Return ENOENT if the current node is already removed.
 	 */
 	TMPFS_ASSERT_LOCKED(node);
 	parent = node->tn_dir.tn_parent;
 	if (parent == NULL)
 		return (ENOENT);
 
 	TMPFS_NODE_LOCK(parent);
 	dent.d_fileno = parent->tn_id;
 	TMPFS_NODE_UNLOCK(parent);
 
 	dent.d_type = DT_DIR;
 	dent.d_namlen = 2;
 	dent.d_name[0] = '.';
 	dent.d_name[1] = '.';
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 	dirent_terminate(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
 		error = EJUSTRETURN;
 	else
 		error = uiomove(&dent, dent.d_reclen, uio);
 
 	tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED);
 
 	return (error);
 }
 
 /*
  * Helper function for tmpfs_readdir.  Returns as much directory entries
  * as can fit in the uio space.  The read starts at uio->uio_offset.
  * The function returns 0 on success, -1 if there was not enough space
  * in the uio structure to hold the directory entry or an appropriate
  * error code if another error happens.
  */
 int
 tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node,
     struct uio *uio, int maxcookies, u_long *cookies, int *ncookies)
 {
 	struct tmpfs_dir_cursor dc;
 	struct tmpfs_dirent *de;
 	off_t off;
 	int error;
 
 	TMPFS_VALIDATE_DIR(node);
 
 	off = 0;
 
 	/*
 	 * Lookup the node from the current offset.  The starting offset of
 	 * 0 will lookup both '.' and '..', and then the first real entry,
 	 * or EOF if there are none.  Then find all entries for the dir that
 	 * fit into the buffer.  Once no more entries are found (de == NULL),
 	 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next
 	 * call to return 0.
 	 */
 	switch (uio->uio_offset) {
 	case TMPFS_DIRCOOKIE_DOT:
 		error = tmpfs_dir_getdotdent(tm, node, uio);
 		if (error != 0)
 			return (error);
 		uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT;
 		if (cookies != NULL)
 			cookies[(*ncookies)++] = off = uio->uio_offset;
 		/* FALLTHROUGH */
 	case TMPFS_DIRCOOKIE_DOTDOT:
 		error = tmpfs_dir_getdotdotdent(tm, node, uio);
 		if (error != 0)
 			return (error);
 		de = tmpfs_dir_first(node, &dc);
 		uio->uio_offset = tmpfs_dirent_cookie(de);
 		if (cookies != NULL)
 			cookies[(*ncookies)++] = off = uio->uio_offset;
 		/* EOF. */
 		if (de == NULL)
 			return (0);
 		break;
 	case TMPFS_DIRCOOKIE_EOF:
 		return (0);
 	default:
 		de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc);
 		if (de == NULL)
 			return (EINVAL);
 		if (cookies != NULL)
 			off = tmpfs_dirent_cookie(de);
 	}
 
 	/* Read as much entries as possible; i.e., until we reach the end of
 	 * the directory or we exhaust uio space. */
 	do {
 		struct dirent d;
 
 		/* Create a dirent structure representing the current
 		 * tmpfs_node and fill it. */
 		if (de->td_node == NULL) {
 			d.d_fileno = 1;
 			d.d_type = DT_WHT;
 		} else {
 			d.d_fileno = de->td_node->tn_id;
 			switch (de->td_node->tn_type) {
 			case VBLK:
 				d.d_type = DT_BLK;
 				break;
 
 			case VCHR:
 				d.d_type = DT_CHR;
 				break;
 
 			case VDIR:
 				d.d_type = DT_DIR;
 				break;
 
 			case VFIFO:
 				d.d_type = DT_FIFO;
 				break;
 
 			case VLNK:
 				d.d_type = DT_LNK;
 				break;
 
 			case VREG:
 				d.d_type = DT_REG;
 				break;
 
 			case VSOCK:
 				d.d_type = DT_SOCK;
 				break;
 
 			default:
 				panic("tmpfs_dir_getdents: type %p %d",
 				    de->td_node, (int)de->td_node->tn_type);
 			}
 		}
 		d.d_namlen = de->td_namelen;
 		MPASS(de->td_namelen < sizeof(d.d_name));
 		(void)memcpy(d.d_name, de->ud.td_name, de->td_namelen);
 		d.d_reclen = GENERIC_DIRSIZ(&d);
 		dirent_terminate(&d);
 
 		/* Stop reading if the directory entry we are treating is
 		 * bigger than the amount of data that can be returned. */
 		if (d.d_reclen > uio->uio_resid) {
 			error = EJUSTRETURN;
 			break;
 		}
 
 		/* Copy the new dirent structure into the output buffer and
 		 * advance pointers. */
 		error = uiomove(&d, d.d_reclen, uio);
 		if (error == 0) {
 			de = tmpfs_dir_next(node, &dc);
 			if (cookies != NULL) {
 				off = tmpfs_dirent_cookie(de);
 				MPASS(*ncookies < maxcookies);
 				cookies[(*ncookies)++] = off;
 			}
 		}
 	} while (error == 0 && uio->uio_resid > 0 && de != NULL);
 
 	/* Skip setting off when using cookies as it is already done above. */
 	if (cookies == NULL)
 		off = tmpfs_dirent_cookie(de);
 
 	/* Update the offset and cache. */
 	uio->uio_offset = off;
 	node->tn_dir.tn_readdir_lastn = off;
 	node->tn_dir.tn_readdir_lastp = de;
 
 	tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED);
 	return error;
 }
 
 int
 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp)
 {
 	struct tmpfs_dirent *de;
 	int error;
 
 	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL,
 	    cnp->cn_nameptr, cnp->cn_namelen, &de);
 	if (error != 0)
 		return (error);
 	tmpfs_dir_attach(dvp, de);
 	return (0);
 }
 
 void
 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp)
 {
 	struct tmpfs_dirent *de;
 
 	de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp);
 	MPASS(de != NULL && de->td_node == NULL);
 	tmpfs_dir_detach(dvp, de);
 	tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de);
 }
 
 /*
  * Resizes the aobj associated with the regular file pointed to by 'vp' to the
  * size 'newsize'.  'vp' must point to a vnode that represents a regular file.
  * 'newsize' must be positive.
  *
  * Returns zero on success or an appropriate error code on failure.
  */
 int
 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 	vm_object_t uobj;
 	vm_page_t m;
 	vm_pindex_t idx, newpages, oldpages;
 	off_t oldsize;
 	int base, rv;
 
 	MPASS(vp->v_type == VREG);
 	MPASS(newsize >= 0);
 
 	node = VP_TO_TMPFS_NODE(vp);
 	uobj = node->tn_reg.tn_aobj;
 	tmp = VFS_TO_TMPFS(vp->v_mount);
 
 	/*
 	 * Convert the old and new sizes to the number of pages needed to
 	 * store them.  It may happen that we do not need to do anything
 	 * because the last allocated page can accommodate the change on
 	 * its own.
 	 */
 	oldsize = node->tn_size;
 	oldpages = OFF_TO_IDX(oldsize + PAGE_MASK);
 	MPASS(oldpages == uobj->size);
 	newpages = OFF_TO_IDX(newsize + PAGE_MASK);
 
 	if (__predict_true(newpages == oldpages && newsize >= oldsize)) {
 		node->tn_size = newsize;
 		return (0);
 	}
 
 	if (newpages > oldpages &&
 	    tmpfs_pages_check_avail(tmp, newpages - oldpages) == 0)
 		return (ENOSPC);
 
 	VM_OBJECT_WLOCK(uobj);
 	if (newsize < oldsize) {
 		/*
 		 * Zero the truncated part of the last page.
 		 */
 		base = newsize & PAGE_MASK;
 		if (base != 0) {
 			idx = OFF_TO_IDX(newsize);
 retry:
 			m = vm_page_grab(uobj, idx, VM_ALLOC_NOCREAT);
 			if (m != NULL) {
 				MPASS(vm_page_all_valid(m));
 			} else if (vm_pager_has_page(uobj, idx, NULL, NULL)) {
 				m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL |
 				    VM_ALLOC_WAITFAIL);
 				if (m == NULL)
 					goto retry;
 				vm_object_pip_add(uobj, 1);
 				VM_OBJECT_WUNLOCK(uobj);
 				rv = vm_pager_get_pages(uobj, &m, 1, NULL,
 				    NULL);
 				VM_OBJECT_WLOCK(uobj);
 				vm_object_pip_wakeup(uobj);
 				if (rv == VM_PAGER_OK) {
 					/*
 					 * Since the page was not resident,
 					 * and therefore not recently
 					 * accessed, immediately enqueue it
 					 * for asynchronous laundering.  The
 					 * current operation is not regarded
 					 * as an access.
 					 */
 					vm_page_launder(m);
 				} else {
 					vm_page_free(m);
 					if (ignerr)
 						m = NULL;
 					else {
 						VM_OBJECT_WUNLOCK(uobj);
 						return (EIO);
 					}
 				}
 			}
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
 			}
 		}
 
 		/*
 		 * Release any swap space and free any whole pages.
 		 */
-		if (newpages < oldpages) {
-			swap_pager_freespace(uobj, newpages, oldpages -
-			    newpages);
+		if (newpages < oldpages)
 			vm_object_page_remove(uobj, newpages, 0, 0);
-		}
 	}
 	uobj->size = newpages;
 	VM_OBJECT_WUNLOCK(uobj);
 
 	atomic_add_long(&tmp->tm_pages_used, newpages - oldpages);
 
 	node->tn_size = newsize;
 	return (0);
 }
 
 void
 tmpfs_check_mtime(struct vnode *vp)
 {
 	struct tmpfs_node *node;
 	struct vm_object *obj;
 
 	ASSERT_VOP_ELOCKED(vp, "check_mtime");
 	if (vp->v_type != VREG)
 		return;
 	obj = vp->v_object;
 	KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) ==
 	    (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj"));
 	/* unlocked read */
 	if (obj->generation != obj->cleangeneration) {
 		VM_OBJECT_WLOCK(obj);
 		if (obj->generation != obj->cleangeneration) {
 			obj->cleangeneration = obj->generation;
 			node = VP_TO_TMPFS_NODE(vp);
 			node->tn_status |= TMPFS_NODE_MODIFIED |
 			    TMPFS_NODE_CHANGED;
 		}
 		VM_OBJECT_WUNLOCK(obj);
 	}
 }
 
 /*
  * Change flags of the given vnode.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred,
     struct thread *p)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	ASSERT_VOP_ELOCKED(vp, "chflags");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK |
 	    UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP |
 	    UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE |
 	    UF_SPARSE | UF_SYSTEM)) != 0)
 		return (EOPNOTSUPP);
 
 	/* Disallow this operation if the file system is mounted read-only. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return EROFS;
 
 	/*
 	 * Callers may only modify the file flags on objects they
 	 * have VADMIN rights for.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, p)))
 		return (error);
 	/*
 	 * Unprivileged processes are not permitted to unset system
 	 * flags, or modify flags if any system flags are set.
 	 */
 	if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) {
 		if (node->tn_flags &
 		    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 			error = securelevel_gt(cred, 0);
 			if (error)
 				return (error);
 		}
 	} else {
 		if (node->tn_flags &
 		    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 		    ((flags ^ node->tn_flags) & SF_SETTABLE))
 			return (EPERM);
 	}
 	node->tn_flags = flags;
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
 	ASSERT_VOP_ELOCKED(vp, "chflags2");
 
 	return (0);
 }
 
 /*
  * Change access mode on the given vnode.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	ASSERT_VOP_ELOCKED(vp, "chmod");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Disallow this operation if the file system is mounted read-only. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return EROFS;
 
 	/* Immutable or append-only files cannot be modified, either. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return EPERM;
 
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, p)))
 		return (error);
 
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of.
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE))
 			return (EFTYPE);
 	}
 	if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID);
 		if (error)
 			return (error);
 	}
 
 
 	node->tn_mode &= ~ALLPERMS;
 	node->tn_mode |= mode & ALLPERMS;
 
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
 	ASSERT_VOP_ELOCKED(vp, "chmod2");
 
 	return (0);
 }
 
 /*
  * Change ownership of the given vnode.  At least one of uid or gid must
  * be different than VNOVAL.  If one is set to that value, the attribute
  * is unchanged.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
     struct thread *p)
 {
 	int error;
 	struct tmpfs_node *node;
 	uid_t ouid;
 	gid_t ogid;
 
 	ASSERT_VOP_ELOCKED(vp, "chown");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Assign default values if they are unknown. */
 	MPASS(uid != VNOVAL || gid != VNOVAL);
 	if (uid == VNOVAL)
 		uid = node->tn_uid;
 	if (gid == VNOVAL)
 		gid = node->tn_gid;
 	MPASS(uid != VNOVAL && gid != VNOVAL);
 
 	/* Disallow this operation if the file system is mounted read-only. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return EROFS;
 
 	/* Immutable or append-only files cannot be modified, either. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return EPERM;
 
 	/*
 	 * To modify the ownership of a file, must possess VADMIN for that
 	 * file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, p)))
 		return (error);
 
 	/*
 	 * To change the owner of a file, or change the group of a file to a
 	 * group of which we are not a member, the caller must have
 	 * privilege.
 	 */
 	if ((uid != node->tn_uid ||
 	    (gid != node->tn_gid && !groupmember(gid, cred))) &&
 	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN)))
 		return (error);
 
 	ogid = node->tn_gid;
 	ouid = node->tn_uid;
 
 	node->tn_uid = uid;
 	node->tn_gid = gid;
 
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
 	if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID))
 			node->tn_mode &= ~(S_ISUID | S_ISGID);
 	}
 
 	ASSERT_VOP_ELOCKED(vp, "chown2");
 
 	return (0);
 }
 
 /*
  * Change size of the given vnode.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred,
     struct thread *p)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	ASSERT_VOP_ELOCKED(vp, "chsize");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Decide whether this is a valid operation based on the file type. */
 	error = 0;
 	switch (vp->v_type) {
 	case VDIR:
 		return EISDIR;
 
 	case VREG:
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return EROFS;
 		break;
 
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VFIFO:
 		/* Allow modifications of special files even if in the file
 		 * system is mounted read-only (we are not modifying the
 		 * files themselves, but the objects they represent). */
 		return 0;
 
 	default:
 		/* Anything else is unsupported. */
 		return EOPNOTSUPP;
 	}
 
 	/* Immutable or append-only files cannot be modified, either. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return EPERM;
 
 	error = tmpfs_truncate(vp, size);
 	/* tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents
 	 * for us, as will update tn_status; no need to do that here. */
 
 	ASSERT_VOP_ELOCKED(vp, "chsize2");
 
 	return (error);
 }
 
 /*
  * Change access and modification times of the given vnode.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chtimes(struct vnode *vp, struct vattr *vap,
     struct ucred *cred, struct thread *l)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	ASSERT_VOP_ELOCKED(vp, "chtimes");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Disallow this operation if the file system is mounted read-only. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return EROFS;
 
 	/* Immutable or append-only files cannot be modified, either. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return EPERM;
 
 	error = vn_utimes_perm(vp, vap, cred, l);
 	if (error != 0)
 		return (error);
 
 	if (vap->va_atime.tv_sec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_ACCESSED;
 
 	if (vap->va_mtime.tv_sec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_MODIFIED;
 
 	if (vap->va_birthtime.tv_sec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_MODIFIED;
 
 	tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime);
 
 	if (vap->va_birthtime.tv_sec != VNOVAL)
 		node->tn_birthtime = vap->va_birthtime;
 	ASSERT_VOP_ELOCKED(vp, "chtimes2");
 
 	return (0);
 }
 
 void
 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status)
 {
 
 	if ((node->tn_status & status) == status || tm->tm_ronly)
 		return;
 	TMPFS_NODE_LOCK(node);
 	node->tn_status |= status;
 	TMPFS_NODE_UNLOCK(node);
 }
 
 /* Sync timestamps */
 void
 tmpfs_itimes(struct vnode *vp, const struct timespec *acc,
     const struct timespec *mod)
 {
 	struct tmpfs_node *node;
 	struct timespec now;
 
 	ASSERT_VOP_LOCKED(vp, "tmpfs_itimes");
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
 	    TMPFS_NODE_CHANGED)) == 0)
 		return;
 
 	vfs_timestamp(&now);
 	TMPFS_NODE_LOCK(node);
 	if (node->tn_status & TMPFS_NODE_ACCESSED) {
 		if (acc == NULL)
 			 acc = &now;
 		node->tn_atime = *acc;
 	}
 	if (node->tn_status & TMPFS_NODE_MODIFIED) {
 		if (mod == NULL)
 			mod = &now;
 		node->tn_mtime = *mod;
 	}
 	if (node->tn_status & TMPFS_NODE_CHANGED)
 		node->tn_ctime = now;
 	node->tn_status &= ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
 	    TMPFS_NODE_CHANGED);
 	TMPFS_NODE_UNLOCK(node);
 
 	/* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */
 	random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME);
 }
 
 int
 tmpfs_truncate(struct vnode *vp, off_t length)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if (length < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	if (node->tn_size == length) {
 		error = 0;
 		goto out;
 	}
 
 	if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize)
 		return (EFBIG);
 
 	error = tmpfs_reg_resize(vp, length, FALSE);
 	if (error == 0)
 		node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
 
 out:
 	tmpfs_update(vp);
 
 	return (error);
 }
 
 static __inline int
 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b)
 {
 	if (a->td_hash > b->td_hash)
 		return (1);
 	else if (a->td_hash < b->td_hash)
 		return (-1);
 	return (0);
 }
 
 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp);
Index: head/sys/kern/uipc_shm.c
===================================================================
--- head/sys/kern/uipc_shm.c	(revision 362612)
+++ head/sys/kern/uipc_shm.c	(revision 362613)
@@ -1,1562 +1,1557 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for shared swap-backed anonymous memory objects via
  * shm_open(2), shm_rename(2), and shm_unlink(2).
  * While most of the implementation is here, vm_mmap.c contains
  * mapping logic changes.
  *
  * posixshmcontrol(1) allows users to inspect the state of the memory
  * objects.  Per-uid swap resource limit controls total amount of
  * memory that user can consume for anonymous objects, including
  * shared.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/uio.h>
 #include <sys/signal.h>
 #include <sys/jail.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 struct shm_mapping {
 	char		*sm_path;
 	Fnv32_t		sm_fnv;
 	struct shmfd	*sm_shmfd;
 	LIST_ENTRY(shm_mapping) sm_link;
 };
 
 static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
 static LIST_HEAD(, shm_mapping) *shm_dictionary;
 static struct sx shm_dict_lock;
 static struct mtx shm_timestamp_lock;
 static u_long shm_hash;
 static struct unrhdr64 shm_ino_unr;
 static dev_t shm_dev_ino;
 
 #define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])
 
 static void	shm_init(void *arg);
 static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
 static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
 static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	shm_dotruncate_locked(struct shmfd *shmfd, off_t length,
     void *rl_cookie);
 static int	shm_copyin_path(struct thread *td, const char *userpath_in,
     char **path_out);
 
 static fo_rdwr_t	shm_read;
 static fo_rdwr_t	shm_write;
 static fo_truncate_t	shm_truncate;
 static fo_ioctl_t	shm_ioctl;
 static fo_stat_t	shm_stat;
 static fo_close_t	shm_close;
 static fo_chmod_t	shm_chmod;
 static fo_chown_t	shm_chown;
 static fo_seek_t	shm_seek;
 static fo_fill_kinfo_t	shm_fill_kinfo;
 static fo_mmap_t	shm_mmap;
 static fo_get_seals_t	shm_get_seals;
 static fo_add_seals_t	shm_add_seals;
 static fo_fallocate_t	shm_fallocate;
 
 /* File descriptor operations. */
 struct fileops shm_ops = {
 	.fo_read = shm_read,
 	.fo_write = shm_write,
 	.fo_truncate = shm_truncate,
 	.fo_ioctl = shm_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = shm_stat,
 	.fo_close = shm_close,
 	.fo_chmod = shm_chmod,
 	.fo_chown = shm_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = shm_seek,
 	.fo_fill_kinfo = shm_fill_kinfo,
 	.fo_mmap = shm_mmap,
 	.fo_get_seals = shm_get_seals,
 	.fo_add_seals = shm_add_seals,
 	.fo_fallocate = shm_fallocate,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 FEATURE(posix_shm, "POSIX shared memory");
 
 static int
 uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 {
 	vm_page_t m;
 	vm_pindex_t idx;
 	size_t tlen;
 	int error, offset, rv;
 
 	idx = OFF_TO_IDX(uio->uio_offset);
 	offset = uio->uio_offset & PAGE_MASK;
 	tlen = MIN(PAGE_SIZE - offset, len);
 
 	rv = vm_page_grab_valid_unlocked(&m, obj, idx,
 	    VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
 	if (rv == VM_PAGER_OK)
 		goto found;
 
 	/*
 	 * Read I/O without either a corresponding resident page or swap
 	 * page: use zero_region.  This is intended to avoid instantiating
 	 * pages on read from a sparse region.
 	 */
 	VM_OBJECT_WLOCK(obj);
 	m = vm_page_lookup(obj, idx);
 	if (uio->uio_rw == UIO_READ && m == NULL &&
 	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
 	}
 
 	/*
 	 * Although the tmpfs vnode lock is held here, it is
 	 * nonetheless safe to sleep waiting for a free page.  The
 	 * pageout daemon does not need to acquire the tmpfs vnode
 	 * lock to page out tobj's pages because tobj is a OBJT_SWAP
 	 * type object.
 	 */
 	rv = vm_page_grab_valid(&m, obj, idx,
 	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
 	if (rv != VM_PAGER_OK) {
 		VM_OBJECT_WUNLOCK(obj);
 		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
 		    obj, idx, rv);
 		return (EIO);
 	}
 	VM_OBJECT_WUNLOCK(obj);
 
 found:
 	error = uiomove_fromphys(&m, offset, tlen, uio);
 	if (uio->uio_rw == UIO_WRITE && error == 0)
 		vm_page_set_dirty(m);
 	vm_page_activate(m);
 	vm_page_sunbusy(m);
 
 	return (error);
 }
 
 int
 uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
 {
 	ssize_t resid;
 	size_t len;
 	int error;
 
 	error = 0;
 	while ((resid = uio->uio_resid) > 0) {
 		if (obj_size <= uio->uio_offset)
 			break;
 		len = MIN(obj_size - uio->uio_offset, resid);
 		if (len == 0)
 			break;
 		error = uiomove_object_page(obj, len, uio);
 		if (error != 0 || resid == uio->uio_resid)
 			break;
 	}
 	return (error);
 }
 
 static int
 shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct shmfd *shmfd;
 	off_t foffset;
 	int error;
 
 	shmfd = fp->f_data;
 	foffset = foffset_lock(fp, 0);
 	error = 0;
 	switch (whence) {
 	case L_INCR:
 		if (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset)) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += foffset;
 		break;
 	case L_XTND:
 		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += shmfd->shm_size;
 		break;
 	case L_SET:
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0) {
 		if (offset < 0 || offset > shmfd->shm_size)
 			error = EINVAL;
 		else
 			td->td_uretoff.tdu_off = offset;
 	}
 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
 static int
 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
 	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 static int
 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	if ((flags & FOF_OFFSET) == 0) {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 		    &shmfd->shm_mtx);
 	} else {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
 		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	}
 	if ((shmfd->shm_seals & F_SEAL_WRITE) != 0)
 		error = EPERM;
 	else
 		error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 static int
 shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 #ifdef MAC
 	int error;
 #endif
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	return (shm_dotruncate(shmfd, length));
 }
 
 int
 shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	switch (com) {
 	case FIONBIO:
 	case FIOASYNC:
 		/*
 		 * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
 		 * just like it would on an unlinked regular file
 		 */
 		return (0);
 	default:
 		return (ENOTTY);
 	}
 }
 
 static int
 shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 #ifdef MAC
 	int error;
 #endif
 
 	shmfd = fp->f_data;
 
 #ifdef MAC
 	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	
 	/*
 	 * Attempt to return sanish values for fstat() on a memory file
 	 * descriptor.
 	 */
 	bzero(sb, sizeof(*sb));
 	sb->st_blksize = PAGE_SIZE;
 	sb->st_size = shmfd->shm_size;
 	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
 	mtx_lock(&shm_timestamp_lock);
 	sb->st_atim = shmfd->shm_atime;
 	sb->st_ctim = shmfd->shm_ctime;
 	sb->st_mtim = shmfd->shm_mtime;
 	sb->st_birthtim = shmfd->shm_birthtime;
 	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
 	sb->st_uid = shmfd->shm_uid;
 	sb->st_gid = shmfd->shm_gid;
 	mtx_unlock(&shm_timestamp_lock);
 	sb->st_dev = shm_dev_ino;
 	sb->st_ino = shmfd->shm_ino;
 	sb->st_nlink = shmfd->shm_object->ref_count;
 
 	return (0);
 }
 
 static int
 shm_close(struct file *fp, struct thread *td)
 {
 	struct shmfd *shmfd;
 
 	shmfd = fp->f_data;
 	fp->f_data = NULL;
 	shm_drop(shmfd);
 
 	return (0);
 }
 
 static int
 shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) {
 	int error;
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 
 	path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
 	pr_path = td->td_ucred->cr_prison->pr_path;
 
 	/* Construct a full pathname for jailed callers. */
 	pr_pathlen = strcmp(pr_path, "/") ==
 	    0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN);
 	error = copyinstr(userpath_in, path + pr_pathlen,
 	    MAXPATHLEN - pr_pathlen, NULL);
 	if (error != 0)
 		goto out;
 
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI))
 		ktrnamei(path);
 #endif
 
 	/* Require paths to start with a '/' character. */
 	if (path[pr_pathlen] != '/') {
 		error = EINVAL;
 		goto out;
 	}
 
 	*path_out = path;
 
 out:
 	if (error != 0)
 		free(path, M_SHMFD);
 
 	return (error);
 }
 
 static int
 shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_pindex_t idx, nobjsize;
 	vm_ooffset_t delta;
 	int base, rv;
 
 	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
 	object = shmfd->shm_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
 	if (length == shmfd->shm_size)
 		return (0);
 	nobjsize = OFF_TO_IDX(length + PAGE_MASK);
 
 	/* Are we shrinking?  If so, trim the end. */
 	if (length < shmfd->shm_size) {
 		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
 			return (EPERM);
 
 		/*
 		 * Disallow any requests to shrink the size if this
 		 * object is mapped into the kernel.
 		 */
 		if (shmfd->shm_kmappings > 0)
 			return (EBUSY);
 
 		/*
 		 * Zero the truncated part of the last page.
 		 */
 		base = length & PAGE_MASK;
 		if (base != 0) {
 			idx = OFF_TO_IDX(length);
 retry:
 			m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
 			if (m != NULL) {
 				MPASS(vm_page_all_valid(m));
 			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
 				m = vm_page_alloc(object, idx,
 				    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
 				if (m == NULL)
 					goto retry;
 				vm_object_pip_add(object, 1);
 				VM_OBJECT_WUNLOCK(object);
 				rv = vm_pager_get_pages(object, &m, 1, NULL,
 				    NULL);
 				VM_OBJECT_WLOCK(object);
 				vm_object_pip_wakeup(object);
 				if (rv == VM_PAGER_OK) {
 					/*
 					 * Since the page was not resident,
 					 * and therefore not recently
 					 * accessed, immediately enqueue it
 					 * for asynchronous laundering.  The
 					 * current operation is not regarded
 					 * as an access.
 					 */
 					vm_page_launder(m);
 				} else {
 					vm_page_free(m);
 					VM_OBJECT_WUNLOCK(object);
 					return (EIO);
 				}
 			}
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				KASSERT(vm_page_all_valid(m),
 				    ("shm_dotruncate: page %p is invalid", m));
 				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
 			}
 		}
 		delta = IDX_TO_OFF(object->size - nobjsize);
 
-		/* Toss in memory pages. */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
-
-		/* Toss pages from swap. */
-		if (object->type == OBJT_SWAP)
-			swap_pager_freespace(object, nobjsize, delta);
 
 		/* Free the swap accounted for shm */
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
 	} else {
 		if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
 			return (EPERM);
 
 		/* Try to reserve additional swap space. */
 		delta = IDX_TO_OFF(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred))
 			return (ENOMEM);
 		object->charge += delta;
 	}
 	shmfd->shm_size = length;
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_ctime);
 	shmfd->shm_mtime = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 	object->size = nobjsize;
 	return (0);
 }
 
 int
 shm_dotruncate(struct shmfd *shmfd, off_t length)
 {
 	void *rl_cookie;
 	int error;
 
 	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 	    &shmfd->shm_mtx);
 	VM_OBJECT_WLOCK(shmfd->shm_object);
 	error = shm_dotruncate_locked(shmfd, length, rl_cookie);
 	VM_OBJECT_WUNLOCK(shmfd->shm_object);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	return (error);
 }
 
 /*
  * shmfd object management including creation and reference counting
  * routines.
  */
 struct shmfd *
 shm_alloc(struct ucred *ucred, mode_t mode)
 {
 	struct shmfd *shmfd;
 
 	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
 	shmfd->shm_size = 0;
 	shmfd->shm_uid = ucred->cr_uid;
 	shmfd->shm_gid = ucred->cr_gid;
 	shmfd->shm_mode = mode;
 	shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
 	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
 	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
 	vfs_timestamp(&shmfd->shm_birthtime);
 	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
 	    shmfd->shm_birthtime;
 	shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
 	refcount_init(&shmfd->shm_refs, 1);
 	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
 	rangelock_init(&shmfd->shm_rl);
 #ifdef MAC
 	mac_posixshm_init(shmfd);
 	mac_posixshm_create(ucred, shmfd);
 #endif
 
 	return (shmfd);
 }
 
 struct shmfd *
 shm_hold(struct shmfd *shmfd)
 {
 
 	refcount_acquire(&shmfd->shm_refs);
 	return (shmfd);
 }
 
 void
 shm_drop(struct shmfd *shmfd)
 {
 
 	if (refcount_release(&shmfd->shm_refs)) {
 #ifdef MAC
 		mac_posixshm_destroy(shmfd);
 #endif
 		rangelock_destroy(&shmfd->shm_rl);
 		mtx_destroy(&shmfd->shm_mtx);
 		vm_object_deallocate(shmfd->shm_object);
 		free(shmfd, M_SHMFD);
 	}
 }
 
 /*
  * Determine if the credentials have sufficient permissions for a
  * specified combination of FREAD and FWRITE.
  */
 int
 shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
 {
 	accmode_t accmode;
 	int error;
 
 	accmode = 0;
 	if (flags & FREAD)
 		accmode |= VREAD;
 	if (flags & FWRITE)
 		accmode |= VWRITE;
 	mtx_lock(&shm_timestamp_lock);
 	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 	    accmode, ucred, NULL);
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to shmfd objects.  We use the FNV hash on the path to store
  * the mappings in a hash table.
  */
 static void
 shm_init(void *arg)
 {
 
 	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
 	sx_init(&shm_dict_lock, "shm dictionary");
 	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
 	new_unrhdr64(&shm_ino_unr, 1);
 	shm_dev_ino = devfs_alloc_cdp_inode();
 	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
 }
 SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
 
 static struct shmfd *
 shm_lookup(char *path, Fnv32_t fnv)
 {
 	struct shm_mapping *map;
 
 	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
 		if (map->sm_fnv != fnv)
 			continue;
 		if (strcmp(map->sm_path, path) == 0)
 			return (map->sm_shmfd);
 	}
 
 	return (NULL);
 }
 
 static void
 shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
 {
 	struct shm_mapping *map;
 
 	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
 	map->sm_path = path;
 	map->sm_fnv = fnv;
 	map->sm_shmfd = shm_hold(shmfd);
 	shmfd->shm_path = path;
 	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
 }
 
 static int
 shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct shm_mapping *map;
 	int error;
 
 	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
 		if (map->sm_fnv != fnv)
 			continue;
 		if (strcmp(map->sm_path, path) == 0) {
 #ifdef MAC
 			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
 			if (error)
 				return (error);
 #endif
 			error = shm_access(map->sm_shmfd, ucred,
 			    FREAD | FWRITE);
 			if (error)
 				return (error);
 			map->sm_shmfd->shm_path = NULL;
 			LIST_REMOVE(map, sm_link);
 			shm_drop(map->sm_shmfd);
 			free(map->sm_path, M_SHMFD);
 			free(map, M_SHMFD);
 			return (0);
 		}
 	}
 
 	return (ENOENT);
 }
 
 int
 kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
     int shmflags, struct filecaps *fcaps, const char *name __unused)
 {
 	struct filedesc *fdp;
 	struct shmfd *shmfd;
 	struct file *fp;
 	char *path;
 	void *rl_cookie;
 	Fnv32_t fnv;
 	mode_t cmode;
 	int error, fd, initial_seals;
 
 	if ((shmflags & ~SHM_ALLOW_SEALING) != 0)
 		return (EINVAL);
 
 	initial_seals = F_SEAL_SEAL;
 	if ((shmflags & SHM_ALLOW_SEALING) != 0)
 		initial_seals &= ~F_SEAL_SEAL;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * shm_open(2) is only allowed for anonymous objects.
 	 */
 	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
 		return (ECAPMODE);
 #endif
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 
 	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
 		return (EINVAL);
 
 	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	/*
 	 * Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
 	 * If the decision is made later to allow additional seals, care must be
 	 * taken below to ensure that the seals are properly set if the shmfd
 	 * already existed -- this currently assumes that only F_SEAL_SEAL can
 	 * be set and doesn't take further precautions to ensure the validity of
 	 * the seals being added with respect to current mappings.
 	 */
 	if ((initial_seals & ~F_SEAL_SEAL) != 0)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
 	cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
 
 	/*
 	 * shm_open(2) created shm should always have O_CLOEXEC set, as mandated
 	 * by POSIX.  We allow it to be unset here so that an in-kernel
 	 * interface may be written as a thin layer around shm, optionally not
 	 * setting CLOEXEC.  For shm_open(2), O_CLOEXEC is set unconditionally
 	 * in sys_shm_open() to keep this implementation compliant.
 	 */
 	error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps);
 	if (error)
 		return (error);
 
 	/* A SHM_ANON path pointer creates an anonymous object. */
 	if (userpath == SHM_ANON) {
 		/* A read-only anonymous object is pointless. */
 		if ((flags & O_ACCMODE) == O_RDONLY) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		shmfd = shm_alloc(td->td_ucred, cmode);
 		shmfd->shm_seals = initial_seals;
 	} else {
 		error = shm_copyin_path(td, userpath, &path);
 		if (error != 0) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (error);
 		}
 
 		AUDIT_ARG_UPATH1_CANON(path);
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&shm_dict_lock);
 		shmfd = shm_lookup(path, fnv);
 		if (shmfd == NULL) {
 			/* Object does not yet exist, create it if requested. */
 			if (flags & O_CREAT) {
 #ifdef MAC
 				error = mac_posixshm_check_create(td->td_ucred,
 				    path);
 				if (error == 0) {
 #endif
 					shmfd = shm_alloc(td->td_ucred, cmode);
 					shmfd->shm_seals = initial_seals;
 					shm_insert(path, fnv, shmfd);
 #ifdef MAC
 				}
 #endif
 			} else {
 				free(path, M_SHMFD);
 				error = ENOENT;
 			}
 		} else {
 			rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 			    &shmfd->shm_mtx);
 
 			/*
 			 * kern_shm_open() likely shouldn't ever error out on
 			 * trying to set a seal that already exists, unlike
 			 * F_ADD_SEALS.  This would break terribly as
 			 * shm_open(2) actually sets F_SEAL_SEAL to maintain
 			 * historical behavior where the underlying file could
 			 * not be sealed.
 			 */
 			initial_seals &= ~shmfd->shm_seals;
 
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			free(path, M_SHMFD);
 
 			/*
 			 * initial_seals can't set additional seals if we've
 			 * already been set F_SEAL_SEAL.  If F_SEAL_SEAL is set,
 			 * then we've already removed that one from
 			 * initial_seals.  This is currently redundant as we
 			 * only allow setting F_SEAL_SEAL at creation time, but
 			 * it's cheap to check and decreases the effort required
 			 * to allow additional seals.
 			 */
 			if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 &&
 			    initial_seals != 0)
 				error = EPERM;
 			else if ((flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixshm_check_open(td->td_ucred,
 				    shmfd, FFLAGS(flags & O_ACCMODE));
 				if (error == 0)
 #endif
 				error = shm_access(shmfd, td->td_ucred,
 				    FFLAGS(flags & O_ACCMODE));
 			}
 
 			/*
 			 * Truncate the file back to zero length if
 			 * O_TRUNC was specified and the object was
 			 * opened with read/write.
 			 */
 			if (error == 0 &&
 			    (flags & (O_ACCMODE | O_TRUNC)) ==
 			    (O_RDWR | O_TRUNC)) {
 				VM_OBJECT_WLOCK(shmfd->shm_object);
 #ifdef MAC
 				error = mac_posixshm_check_truncate(
 					td->td_ucred, fp->f_cred, shmfd);
 				if (error == 0)
 #endif
 					error = shm_dotruncate_locked(shmfd, 0,
 					    rl_cookie);
 				VM_OBJECT_WUNLOCK(shmfd->shm_object);
 			}
 			if (error == 0) {
 				/*
 				 * Currently we only allow F_SEAL_SEAL to be
 				 * set initially.  As noted above, this would
 				 * need to be reworked should that change.
 				 */
 				shmfd->shm_seals |= initial_seals;
 				shm_hold(shmfd);
 			}
 			rangelock_unlock(&shmfd->shm_rl, rl_cookie,
 			    &shmfd->shm_mtx);
 		}
 		sx_xunlock(&shm_dict_lock);
 
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 
 	return (0);
 }
 
 /* System calls. */
 #ifdef COMPAT_FREEBSD12
 int
 freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap)
 {
 
 	return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC,
 	    uap->mode, NULL));
 }
 #endif
 
 int
 sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
 {
 	char *path;
 	Fnv32_t fnv;
 	int error;
 
 	error = shm_copyin_path(td, uap->path, &path);
 	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_UPATH1_CANON(path);
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&shm_dict_lock);
 	error = shm_remove(path, fnv, td->td_ucred);
 	sx_xunlock(&shm_dict_lock);
 	free(path, M_SHMFD);
 
 	return (error);
 }
 
 int
 sys_shm_rename(struct thread *td, struct shm_rename_args *uap)
 {
 	char *path_from = NULL, *path_to = NULL;
 	Fnv32_t fnv_from, fnv_to;
 	struct shmfd *fd_from;
 	struct shmfd *fd_to;
 	int error;
 	int flags;
 
 	flags = uap->flags;
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * Make sure the user passed only valid flags.
 	 * If you add a new flag, please add a new term here.
 	 */
 	if ((flags & ~(
 	    SHM_RENAME_NOREPLACE |
 	    SHM_RENAME_EXCHANGE
 	    )) != 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * EXCHANGE and NOREPLACE don't quite make sense together. Let's
 	 * force the user to choose one or the other.
 	 */
 	if ((flags & SHM_RENAME_NOREPLACE) != 0 &&
 	    (flags & SHM_RENAME_EXCHANGE) != 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Renaming to or from anonymous makes no sense */
 	if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) {
 		error = EINVAL;
 		goto out;
 	}
 
 	error = shm_copyin_path(td, uap->path_from, &path_from);
 	if (error != 0)
 		goto out;
 
 	error = shm_copyin_path(td, uap->path_to, &path_to);
 	if (error != 0)
 		goto out;
 
 	AUDIT_ARG_UPATH1_CANON(path_from);
 	AUDIT_ARG_UPATH2_CANON(path_to);
 
 	/* Rename with from/to equal is a no-op */
 	if (strcmp(path_from, path_to) == 0)
 		goto out;
 
 	fnv_from = fnv_32_str(path_from, FNV1_32_INIT);
 	fnv_to = fnv_32_str(path_to, FNV1_32_INIT);
 
 	sx_xlock(&shm_dict_lock);
 
 	fd_from = shm_lookup(path_from, fnv_from);
 	if (fd_from == NULL) {
 		error = ENOENT;
 		goto out_locked;
 	}
 
 	fd_to = shm_lookup(path_to, fnv_to);
 	if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) {
 		error = EEXIST;
 		goto out_locked;
 	}
 
 	/*
 	 * Unconditionally prevents shm_remove from invalidating the 'from'
 	 * shm's state.
 	 */
 	shm_hold(fd_from);
 	error = shm_remove(path_from, fnv_from, td->td_ucred);
 
 	/*
 	 * One of my assumptions failed if ENOENT (e.g. locking didn't
 	 * protect us)
 	 */
 	KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s",
 	    path_from));
 	if (error != 0) {
 		shm_drop(fd_from);
 		goto out_locked;
 	}
 
 	/*
 	 * If we are exchanging, we need to ensure the shm_remove below
 	 * doesn't invalidate the dest shm's state.
 	 */
 	if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL)
 		shm_hold(fd_to);
 
 	/*
 	 * NOTE: if path_to is not already in the hash, c'est la vie;
 	 * it simply means we have nothing already at path_to to unlink.
 	 * That is the ENOENT case.
 	 *
 	 * If we somehow don't have access to unlink this guy, but
 	 * did for the shm at path_from, then relink the shm to path_from
 	 * and abort with EACCES.
 	 *
 	 * All other errors: that is weird; let's relink and abort the
 	 * operation.
 	 */
 	error = shm_remove(path_to, fnv_to, td->td_ucred);
 	if (error != 0 && error != ENOENT) {
 		shm_insert(path_from, fnv_from, fd_from);
 		shm_drop(fd_from);
 		/* Don't free path_from now, since the hash references it */
 		path_from = NULL;
 		goto out_locked;
 	}
 
 	error = 0;
 
 	shm_insert(path_to, fnv_to, fd_from);
 
 	/* Don't free path_to now, since the hash references it */
 	path_to = NULL;
 
 	/* We kept a ref when we removed, and incremented again in insert */
 	shm_drop(fd_from);
 	KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n",
 	    fd_from->shm_refs));
 
 	if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) {
 		shm_insert(path_from, fnv_from, fd_to);
 		path_from = NULL;
 		shm_drop(fd_to);
 		KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n",
 		    fd_to->shm_refs));
 	}
 
 out_locked:
 	sx_xunlock(&shm_dict_lock);
 
 out:
 	free(path_from, M_SHMFD);
 	free(path_to, M_SHMFD);
 	return (error);
 }
 
 int
 shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
     vm_ooffset_t foff, struct thread *td)
 {
 	struct shmfd *shmfd;
 	vm_prot_t maxprot;
 	int error;
 	bool writecnt;
 	void *rl_cookie;
 
 	shmfd = fp->f_data;
 	maxprot = VM_PROT_NONE;
 
 	rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize,
 	    &shmfd->shm_mtx);
 	/* FREAD should always be set. */
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
 
 	/*
 	 * If FWRITE's set, we can allow VM_PROT_WRITE unless it's a shared
 	 * mapping with a write seal applied.  Private mappings are always
 	 * writeable.
 	 */
 	if ((flags & MAP_SHARED) == 0) {
 		cap_maxprot |= VM_PROT_WRITE;
 		maxprot |= VM_PROT_WRITE;
 		writecnt = false;
 	} else {
 		if ((fp->f_flag & FWRITE) != 0 &&
 		    (shmfd->shm_seals & F_SEAL_WRITE) == 0)
 			maxprot |= VM_PROT_WRITE;
 
 		/*
 		 * Any mappings from a writable descriptor may be upgraded to
 		 * VM_PROT_WRITE with mprotect(2), unless a write-seal was
 		 * applied between the open and subsequent mmap(2).  We want to
 		 * reject application of a write seal as long as any such
 		 * mapping exists so that the seal cannot be trivially bypassed.
 		 */
 		writecnt = (maxprot & VM_PROT_WRITE) != 0;
 		if (!writecnt && (prot & VM_PROT_WRITE) != 0) {
 			error = EACCES;
 			goto out;
 		}
 	}
 	maxprot &= cap_maxprot;
 
 	/* See comment in vn_mmap(). */
 	if (
 #ifdef _LP64
 	    objsize > OFF_MAX ||
 #endif
 	    foff < 0 || foff > OFF_MAX - objsize) {
 		error = EINVAL;
 		goto out;
 	}
 
 #ifdef MAC
 	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
 	if (error != 0)
 		goto out;
 #endif
 	
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_atime);
 	mtx_unlock(&shm_timestamp_lock);
 	vm_object_reference(shmfd->shm_object);
 
 	if (writecnt)
 		vm_pager_update_writecount(shmfd->shm_object, 0, objsize);
 	error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
 	    shmfd->shm_object, foff, writecnt, td);
 	if (error != 0) {
 		if (writecnt)
 			vm_pager_release_writecount(shmfd->shm_object, 0,
 			    objsize);
 		vm_object_deallocate(shmfd->shm_object);
 	}
 out:
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	return (error);
 }
 
 static int
 shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 	/*
 	 * SUSv4 says that x bits of permission need not be affected.
 	 * Be consistent with our shm_open there.
 	 */
 #ifdef MAC
 	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
 	if (error != 0)
 		goto out;
 #endif
 	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
 	    shmfd->shm_gid, VADMIN, active_cred, NULL);
 	if (error != 0)
 		goto out;
 	shmfd->shm_mode = mode & ACCESSPERMS;
 out:
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 static int
 shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 #ifdef MAC
 	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
 	if (error != 0)
 		goto out;
 #endif
 	if (uid == (uid_t)-1)
 		uid = shmfd->shm_uid;
 	if (gid == (gid_t)-1)
                  gid = shmfd->shm_gid;
 	if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
 	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
 		goto out;
 	shmfd->shm_uid = uid;
 	shmfd->shm_gid = gid;
 out:
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * Helper routines to allow the backing object of a shared memory file
  * descriptor to be mapped in the kernel.
  */
 int
 shm_map(struct file *fp, size_t size, off_t offset, void **memp)
 {
 	struct shmfd *shmfd;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	obj = shmfd->shm_object;
 	VM_OBJECT_WLOCK(obj);
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
 	 */
 	if (offset >= shmfd->shm_size ||
 	    offset + size > round_page(shmfd->shm_size)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (EINVAL);
 	}
 
 	shmfd->shm_kmappings++;
 	vm_object_reference_locked(obj);
 	VM_OBJECT_WUNLOCK(obj);
 
 	/* Map the object into the kernel_map and wire it. */
 	kva = vm_map_min(kernel_map);
 	ofs = offset & PAGE_MASK;
 	offset = trunc_page(offset);
 	size = round_page(size + ofs);
 	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
 	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, 0);
 	if (rv == KERN_SUCCESS) {
 		rv = vm_map_wire(kernel_map, kva, kva + size,
 		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 		if (rv == KERN_SUCCESS) {
 			*memp = (void *)(kva + ofs);
 			return (0);
 		}
 		vm_map_remove(kernel_map, kva, kva + size);
 	} else
 		vm_object_deallocate(obj);
 
 	/* On failure, drop our mapping reference. */
 	VM_OBJECT_WLOCK(obj);
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * We require the caller to unmap the entire entry.  This allows us to
  * safely decrement shm_kmappings when a mapping is removed.
  */
 int
 shm_unmap(struct file *fp, void *mem, size_t size)
 {
 	struct shmfd *shmfd;
 	vm_map_entry_t entry;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 	vm_map_t map;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	kva = (vm_offset_t)mem;
 	ofs = kva & PAGE_MASK;
 	kva = trunc_page(kva);
 	size = round_page(size + ofs);
 	map = kernel_map;
 	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
 	    &obj, &pindex, &prot, &wired);
 	if (rv != KERN_SUCCESS)
 		return (EINVAL);
 	if (entry->start != kva || entry->end != kva + size) {
 		vm_map_lookup_done(map, entry);
 		return (EINVAL);
 	}
 	vm_map_lookup_done(map, entry);
 	if (obj != shmfd->shm_object)
 		return (EINVAL);
 	vm_map_remove(map, kva, kva + size);
 	VM_OBJECT_WLOCK(obj);
 	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 	return (0);
 }
 
 static int
 shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list)
 {
 	const char *path, *pr_path;
 	size_t pr_pathlen;
 	bool visible;
 
 	sx_assert(&shm_dict_lock, SA_LOCKED);
 	kif->kf_type = KF_TYPE_SHM;
 	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;
 	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
 	if (shmfd->shm_path != NULL) {
 		if (shmfd->shm_path != NULL) {
 			path = shmfd->shm_path;
 			pr_path = curthread->td_ucred->cr_prison->pr_path;
 			if (strcmp(pr_path, "/") != 0) {
 				/* Return the jail-rooted pathname. */
 				pr_pathlen = strlen(pr_path);
 				visible = strncmp(path, pr_path, pr_pathlen)
 				    == 0 && path[pr_pathlen] == '/';
 				if (list && !visible)
 					return (EPERM);
 				if (visible)
 					path += pr_pathlen;
 			}
 			strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
 		}
 	}
 	return (0);
 }
 
 static int
 shm_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp __unused)
 {
 	int res;
 
 	sx_slock(&shm_dict_lock);
 	res = shm_fill_kinfo_locked(fp->f_data, kif, false);
 	sx_sunlock(&shm_dict_lock);
 	return (res);
 }
 
 static int
 shm_add_seals(struct file *fp, int seals)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	vm_ooffset_t writemappings;
 	int error, nseals;
 
 	error = 0;
 	shmfd = fp->f_data;
 	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 	    &shmfd->shm_mtx);
 
 	/* Even already-set seals should result in EPERM. */
 	if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) {
 		error = EPERM;
 		goto out;
 	}
 	nseals = seals & ~shmfd->shm_seals;
 	if ((nseals & F_SEAL_WRITE) != 0) {
 		/*
 		 * The rangelock above prevents writable mappings from being
 		 * added after we've started applying seals.  The RLOCK here
 		 * is to avoid torn reads on ILP32 arches as unmapping/reducing
 		 * writemappings will be done without a rangelock.
 		 */
 		VM_OBJECT_RLOCK(shmfd->shm_object);
 		writemappings = shmfd->shm_object->un_pager.swp.writemappings;
 		VM_OBJECT_RUNLOCK(shmfd->shm_object);
 		/* kmappings are also writable */
 		if (writemappings > 0) {
 			error = EBUSY;
 			goto out;
 		}
 	}
 	shmfd->shm_seals |= nseals;
 out:
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	return (error);
 }
 
 static int
 shm_get_seals(struct file *fp, int *seals)
 {
 	struct shmfd *shmfd;
 
 	shmfd = fp->f_data;
 	*seals = shmfd->shm_seals;
 	return (0);
 }
 
 static int
 shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 {
 	void *rl_cookie;
 	struct shmfd *shmfd;
 	size_t size;
 	int error;
 
 	/* This assumes that the caller already checked for overflow. */
 	error = 0;
 	shmfd = fp->f_data;
 	size = offset + len;
 
 	/*
 	 * Just grab the rangelock for the range that we may be attempting to
 	 * grow, rather than blocking read/write for regions we won't be
 	 * touching while this (potential) resize is in progress.  Other
 	 * attempts to resize the shmfd will have to take a write lock from 0 to
 	 * OFF_MAX, so this being potentially beyond the current usable range of
 	 * the shmfd is not necessarily a concern.  If other mechanisms are
 	 * added to grow a shmfd, this may need to be re-evaluated.
 	 */
 	rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size,
 	    &shmfd->shm_mtx);
 	if (size > shmfd->shm_size) {
 		VM_OBJECT_WLOCK(shmfd->shm_object);
 		error = shm_dotruncate_locked(shmfd, size, rl_cookie);
 		VM_OBJECT_WUNLOCK(shmfd->shm_object);
 	}
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	/* Translate to posix_fallocate(2) return value as needed. */
 	if (error == ENOMEM)
 		error = ENOSPC;
 	return (error);
 }
 
 static int
 sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS)
 {
 	struct shm_mapping *shmm;
 	struct sbuf sb;
 	struct kinfo_file kif;
 	u_long i;
 	ssize_t curlen;
 	int error, error2;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	curlen = 0;
 	error = 0;
 	sx_slock(&shm_dict_lock);
 	for (i = 0; i < shm_hash + 1; i++) {
 		LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) {
 			error = shm_fill_kinfo_locked(shmm->sm_shmfd,
 			    &kif, true);
 			if (error == EPERM)
 				continue;
 			if (error != 0)
 				break;
 			pack_kinfo(&kif);
 			if (req->oldptr != NULL &&
 			    kif.kf_structsize + curlen > req->oldlen)
 				break;
 			error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ?
 			    0 : ENOMEM;
 			if (error != 0)
 				break;
 			curlen += kif.kf_structsize;
 		}
 	}
 	sx_sunlock(&shm_dict_lock);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list,
     CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE,
     NULL, 0, sysctl_posix_shm_list, "",
     "POSIX SHM list");
 
 int
 kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode,
     struct filecaps *caps)
 {
 
 	return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL));
 }
 
 /*
  * This version of the shm_open() interface leaves CLOEXEC behavior up to the
  * caller, and libc will enforce it for the traditional shm_open() call.  This
  * allows other consumers, like memfd_create(), to opt-in for CLOEXEC.  This
  * interface also includes a 'name' argument that is currently unused, but could
  * potentially be exported later via some interface for debugging purposes.
  * From the kernel's perspective, it is optional.  Individual consumers like
  * memfd_create() may require it in order to be compatible with other systems
  * implementing the same function.
  */
 int
 sys_shm_open2(struct thread *td, struct shm_open2_args *uap)
 {
 
 	return (kern_shm_open2(td, uap->path, uap->flags, uap->mode,
 	    uap->shmflags, NULL, uap->name));
 }
Index: head/sys/vm/vm_map.c
===================================================================
--- head/sys/vm/vm_map.c	(revision 362612)
+++ head/sys/vm/vm_map.c	(revision 362613)
@@ -1,5172 +1,5168 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory mapping module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/elf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/vnode.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/file.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vnode_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 /*
  *	Virtual memory maps provide for the mapping, protection,
  *	and sharing of virtual memory objects.  In addition,
  *	this module provides for an efficient virtual copy of
  *	memory from one map to another.
  *
  *	Synchronization is required prior to most operations.
  *
  *	Maps consist of an ordered doubly-linked list of simple
  *	entries; a self-adjusting binary search tree of these
  *	entries is used to speed up lookups.
  *
  *	Since portions of maps are specified by start/end addresses,
  *	which may not align with existing map entries, all
  *	routines merely "clip" entries to these start/end values.
  *	[That is, an entry is split into two, bordering at a
  *	start or end value.]  Note that these clippings may not
  *	always be necessary (as the two resulting entries are then
  *	not changed); however, the clipping is done for convenience.
  *
  *	As mentioned above, virtual copy operations are performed
  *	by copying VM object references from one map to
  *	another, and then marking both regions as copy-on-write.
  */
 
 static struct mtx map_sleep_mtx;
 static uma_zone_t mapentzone;
 static uma_zone_t kmapentzone;
 static uma_zone_t mapzone;
 static uma_zone_t vmspace_zone;
 static int vmspace_zinit(void *mem, int size, int flags);
 static int vm_map_zinit(void *mem, int ize, int flags);
 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
     vm_offset_t max);
 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
     vm_map_entry_t gap_entry);
 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
 #ifdef INVARIANTS
 static void vm_map_zdtor(void *mem, int size, void *arg);
 static void vmspace_zdtor(void *mem, int size, void *arg);
 #endif
 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
     int cow);
 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
     vm_offset_t failed_addr);
 
 #define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
 
 /* 
  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
  * stable.
  */
 #define PROC_VMSPACE_LOCK(p) do { } while (0)
 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
 
 /*
  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
  *
  *	Asserts that the starting and ending region
  *	addresses fall within the valid range of the map.
  */
 #define	VM_MAP_RANGE_CHECK(map, start, end)		\
 		{					\
 		if (start < vm_map_min(map))		\
 			start = vm_map_min(map);	\
 		if (end > vm_map_max(map))		\
 			end = vm_map_max(map);		\
 		if (start > end)			\
 			start = end;			\
 		}
 
 /*
  *	vm_map_startup:
  *
  *	Initialize the vm_map module.  Must be called before
  *	any other vm_map routines.
  *
  *	Map and entry structures are allocated from the general
  *	purpose memory pool with some exceptions:
  *
  *	- The kernel map and kmem submap are allocated statically.
  *	- Kernel map entries are allocated out of a static pool.
  *
  *	These restrictions are necessary since malloc() uses the
  *	maps and requires map entries.
  */
 
 void
 vm_map_startup(void)
 {
 	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
 	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
 #ifdef INVARIANTS
 	    vm_map_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_prealloc(mapzone, MAX_KMAP);
 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
 #ifdef INVARIANTS
 	    vmspace_zdtor,
 #else
 	    NULL,
 #endif
 	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 
 static int
 vmspace_zinit(void *mem, int size, int flags)
 {
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
 
 	vm->vm_map.pmap = NULL;
 	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
 	PMAP_LOCK_INIT(vmspace_pmap(vm));
 	return (0);
 }
 
 static int
 vm_map_zinit(void *mem, int size, int flags)
 {
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
 	memset(map, 0, sizeof(*map));
 	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "vm map (user)");
 	return (0);
 }
 
 #ifdef INVARIANTS
 static void
 vmspace_zdtor(void *mem, int size, void *arg)
 {
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
 
 	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
 }
 static void
 vm_map_zdtor(void *mem, int size, void *arg)
 {
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
 	KASSERT(map->nentries == 0,
 	    ("map %p nentries == %d on free.",
 	    map, map->nentries));
 	KASSERT(map->size == 0,
 	    ("map %p size == %lu on free.",
 	    map, (unsigned long)map->size));
 }
 #endif	/* INVARIANTS */
 
 /*
  * Allocate a vmspace structure, including a vm_map and pmap,
  * and initialize those structures.  The refcnt is set to 1.
  *
  * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
  */
 struct vmspace *
 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
 {
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
 	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
 	if (!pinit(vmspace_pmap(vm))) {
 		uma_zfree(vmspace_zone, vm);
 		return (NULL);
 	}
 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
 	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
 	vm->vm_refcnt = 1;
 	vm->vm_shm = NULL;
 	vm->vm_swrss = 0;
 	vm->vm_tsize = 0;
 	vm->vm_dsize = 0;
 	vm->vm_ssize = 0;
 	vm->vm_taddr = 0;
 	vm->vm_daddr = 0;
 	vm->vm_maxsaddr = 0;
 	return (vm);
 }
 
 #ifdef RACCT
 static void
 vmspace_container_reset(struct proc *p)
 {
 
 	PROC_LOCK(p);
 	racct_set(p, RACCT_DATA, 0);
 	racct_set(p, RACCT_STACK, 0);
 	racct_set(p, RACCT_RSS, 0);
 	racct_set(p, RACCT_MEMLOCK, 0);
 	racct_set(p, RACCT_VMEM, 0);
 	PROC_UNLOCK(p);
 }
 #endif
 
 static inline void
 vmspace_dofree(struct vmspace *vm)
 {
 
 	CTR1(KTR_VM, "vmspace_free: %p", vm);
 
 	/*
 	 * Make sure any SysV shm is freed, it might not have been in
 	 * exit1().
 	 */
 	shmexit(vm);
 
 	/*
 	 * Lock the map, to wait out all other references to it.
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
 	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
 	    vm_map_max(&vm->vm_map));
 
 	pmap_release(vmspace_pmap(vm));
 	vm->vm_map.pmap = NULL;
 	uma_zfree(vmspace_zone, vm);
 }
 
 void
 vmspace_free(struct vmspace *vm)
 {
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "vmspace_free() called");
 
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
 
 	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
 		vmspace_dofree(vm);
 }
 
 void
 vmspace_exitfree(struct proc *p)
 {
 	struct vmspace *vm;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	p->p_vmspace = NULL;
 	PROC_VMSPACE_UNLOCK(p);
 	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
 	vmspace_free(vm);
 }
 
 void
 vmspace_exit(struct thread *td)
 {
 	int refcnt;
 	struct vmspace *vm;
 	struct proc *p;
 
 	/*
 	 * Release user portion of address space.
 	 * This releases references to vnodes,
 	 * which could cause I/O if the file has been unlinked.
 	 * Need to do this early enough that we can still sleep.
 	 *
 	 * The last exiting process to reach this point releases as
 	 * much of the environment as it can. vmspace_dofree() is the
 	 * slower fallback in case another process had a temporary
 	 * reference to the vmspace.
 	 */
 
 	p = td->td_proc;
 	vm = p->p_vmspace;
 	atomic_add_int(&vmspace0.vm_refcnt, 1);
 	refcnt = vm->vm_refcnt;
 	do {
 		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
 			/* Switch now since other proc might free vmspace */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = &vmspace0;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1));
 	if (refcnt == 1) {
 		if (p->p_vmspace != vm) {
 			/* vmspace not yet freed, switch back */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = vm;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 		pmap_remove_pages(vmspace_pmap(vm));
 		/* Switch now since this proc will free vmspace */
 		PROC_VMSPACE_LOCK(p);
 		p->p_vmspace = &vmspace0;
 		PROC_VMSPACE_UNLOCK(p);
 		pmap_activate(td);
 		vmspace_dofree(vm);
 	}
 #ifdef RACCT
 	if (racct_enable)
 		vmspace_container_reset(p);
 #endif
 }
 
 /* Acquire reference to vmspace owned by another process. */
 
 struct vmspace *
 vmspace_acquire_ref(struct proc *p)
 {
 	struct vmspace *vm;
 	int refcnt;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	if (vm == NULL) {
 		PROC_VMSPACE_UNLOCK(p);
 		return (NULL);
 	}
 	refcnt = vm->vm_refcnt;
 	do {
 		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
 			PROC_VMSPACE_UNLOCK(p);
 			return (NULL);
 		}
 	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1));
 	if (vm != p->p_vmspace) {
 		PROC_VMSPACE_UNLOCK(p);
 		vmspace_free(vm);
 		return (NULL);
 	}
 	PROC_VMSPACE_UNLOCK(p);
 	return (vm);
 }
 
 /*
  * Switch between vmspaces in an AIO kernel process.
  *
  * The new vmspace is either the vmspace of a user process obtained
  * from an active AIO request or the initial vmspace of the AIO kernel
  * process (when it is idling).  Because user processes will block to
  * drain any active AIO requests before proceeding in exit() or
  * execve(), the reference count for vmspaces from AIO requests can
  * never be 0.  Similarly, AIO kernel processes hold an extra
  * reference on their initial vmspace for the life of the process.  As
  * a result, the 'newvm' vmspace always has a non-zero reference
  * count.  This permits an additional reference on 'newvm' to be
  * acquired via a simple atomic increment rather than the loop in
  * vmspace_acquire_ref() above.
  */
 void
 vmspace_switch_aio(struct vmspace *newvm)
 {
 	struct vmspace *oldvm;
 
 	/* XXX: Need some way to assert that this is an aio daemon. */
 
 	KASSERT(newvm->vm_refcnt > 0,
 	    ("vmspace_switch_aio: newvm unreferenced"));
 
 	oldvm = curproc->p_vmspace;
 	if (oldvm == newvm)
 		return;
 
 	/*
 	 * Point to the new address space and refer to it.
 	 */
 	curproc->p_vmspace = newvm;
 	atomic_add_int(&newvm->vm_refcnt, 1);
 
 	/* Activate the new mapping. */
 	pmap_activate(curthread);
 
 	vmspace_free(oldvm);
 }
 
 void
 _vm_map_lock(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_xlock_(&map->lock, file, line);
 	map->timestamp++;
 }
 
 void
 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
 {
 	vm_object_t object;
 	struct vnode *vp;
 	bool vp_held;
 
 	if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
 		return;
 	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 	    ("Submap with execs"));
 	object = entry->object.vm_object;
 	KASSERT(object != NULL, ("No object for text, entry %p", entry));
 	if ((object->flags & OBJ_ANON) != 0)
 		object = object->handle;
 	else
 		KASSERT(object->backing_object == NULL,
 		    ("non-anon object %p shadows", object));
 	KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
 	    entry, entry->object.vm_object));
 
 	/*
 	 * Mostly, we do not lock the backing object.  It is
 	 * referenced by the entry we are processing, so it cannot go
 	 * away.
 	 */
 	vp = NULL;
 	vp_held = false;
 	if (object->type == OBJT_DEAD) {
 		/*
 		 * For OBJT_DEAD objects, v_writecount was handled in
 		 * vnode_pager_dealloc().
 		 */
 	} else if (object->type == OBJT_VNODE) {
 		vp = object->handle;
 	} else if (object->type == OBJT_SWAP) {
 		KASSERT((object->flags & OBJ_TMPFS_NODE) != 0,
 		    ("vm_map_entry_set_vnode_text: swap and !TMPFS "
 		    "entry %p, object %p, add %d", entry, object, add));
 		/*
 		 * Tmpfs VREG node, which was reclaimed, has
 		 * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS.  In
 		 * this case there is no v_writecount to adjust.
 		 */
 		VM_OBJECT_RLOCK(object);
 		if ((object->flags & OBJ_TMPFS) != 0) {
 			vp = object->un_pager.swp.swp_tmpfs;
 			if (vp != NULL) {
 				vhold(vp);
 				vp_held = true;
 			}
 		}
 		VM_OBJECT_RUNLOCK(object);
 	} else {
 		KASSERT(0,
 		    ("vm_map_entry_set_vnode_text: wrong object type, "
 		    "entry %p, object %p, add %d", entry, object, add));
 	}
 	if (vp != NULL) {
 		if (add) {
 			VOP_SET_TEXT_CHECKED(vp);
 		} else {
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			VOP_UNSET_TEXT_CHECKED(vp);
 			VOP_UNLOCK(vp);
 		}
 		if (vp_held)
 			vdrop(vp);
 	}
 }
 
 /*
  * Use a different name for this vm_map_entry field when it's use
  * is not consistent with its use as part of an ordered search tree.
  */
 #define defer_next right
 
 static void
 vm_map_process_deferred(void)
 {
 	struct thread *td;
 	vm_map_entry_t entry, next;
 	vm_object_t object;
 
 	td = curthread;
 	entry = td->td_map_def_user;
 	td->td_map_def_user = NULL;
 	while (entry != NULL) {
 		next = entry->defer_next;
 		MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
 		    MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
 		    MAP_ENTRY_VN_EXEC));
 		if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
 			/*
 			 * Decrement the object's writemappings and
 			 * possibly the vnode's v_writecount.
 			 */
 			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 			    ("Submap with writecount"));
 			object = entry->object.vm_object;
 			KASSERT(object != NULL, ("No object for writecount"));
 			vm_pager_release_writecount(object, entry->start,
 			    entry->end);
 		}
 		vm_map_entry_set_vnode_text(entry, false);
 		vm_map_entry_deallocate(entry, FALSE);
 		entry = next;
 	}
 }
 
 #ifdef INVARIANTS
 static void
 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	else
 		sx_assert_(&map->lock, SA_XLOCKED, file, line);
 }
 
 #define	VM_MAP_ASSERT_LOCKED(map) \
     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
 
 enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
 #ifdef DIAGNOSTIC
 static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
 #else
 static int enable_vmmap_check = VMMAP_CHECK_NONE;
 #endif
 SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
     &enable_vmmap_check, 0, "Enable vm map consistency checking");
 
 static void _vm_map_assert_consistent(vm_map_t map, int check);
 
 #define VM_MAP_ASSERT_CONSISTENT(map) \
     _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
 #ifdef DIAGNOSTIC
 #define VM_MAP_UNLOCK_CONSISTENT(map) do {				\
 	if (map->nupdates > map->nentries) {				\
 		_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK);	\
 		map->nupdates = 0;					\
 	}								\
 } while (0)
 #else
 #define VM_MAP_UNLOCK_CONSISTENT(map)
 #endif
 #else
 #define	VM_MAP_ASSERT_LOCKED(map)
 #define VM_MAP_ASSERT_CONSISTENT(map)
 #define VM_MAP_UNLOCK_CONSISTENT(map)
 #endif /* INVARIANTS */
 
 void
 _vm_map_unlock(vm_map_t map, const char *file, int line)
 {
 
 	VM_MAP_UNLOCK_CONSISTENT(map);
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else {
 		sx_xunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
 }
 
 void
 _vm_map_lock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_slock_(&map->lock, file, line);
 }
 
 void
 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else {
 		sx_sunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
 }
 
 int
 _vm_map_trylock(vm_map_t map, const char *file, int line)
 {
 	int error;
 
 	error = map->system_map ?
 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
 	    !sx_try_xlock_(&map->lock, file, line);
 	if (error == 0)
 		map->timestamp++;
 	return (error == 0);
 }
 
 int
 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
 {
 	int error;
 
 	error = map->system_map ?
 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
 	    !sx_try_slock_(&map->lock, file, line);
 	return (error == 0);
 }
 
 /*
  *	_vm_map_lock_upgrade:	[ internal use only ]
  *
  *	Tries to upgrade a read (shared) lock on the specified map to a write
  *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
  *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
  *	returned without a read or write lock held.
  *
  *	Requires that the map be read locked.
  */
 int
 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
 {
 	unsigned int last_timestamp;
 
 	if (map->system_map) {
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else {
 		if (!sx_try_upgrade_(&map->lock, file, line)) {
 			last_timestamp = map->timestamp;
 			sx_sunlock_(&map->lock, file, line);
 			vm_map_process_deferred();
 			/*
 			 * If the map's timestamp does not change while the
 			 * map is unlocked, then the upgrade succeeds.
 			 */
 			sx_xlock_(&map->lock, file, line);
 			if (last_timestamp != map->timestamp) {
 				sx_xunlock_(&map->lock, file, line);
 				return (1);
 			}
 		}
 	}
 	map->timestamp++;
 	return (0);
 }
 
 void
 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map) {
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else {
 		VM_MAP_UNLOCK_CONSISTENT(map);
 		sx_downgrade_(&map->lock, file, line);
 	}
 }
 
 /*
  *	vm_map_locked:
  *
  *	Returns a non-zero value if the caller holds a write (exclusive) lock
  *	on the specified map and the value "0" otherwise.
  */
 int
 vm_map_locked(vm_map_t map)
 {
 
 	if (map->system_map)
 		return (mtx_owned(&map->system_mtx));
 	else
 		return (sx_xlocked(&map->lock));
 }
 
 /*
  *	_vm_map_unlock_and_wait:
  *
  *	Atomically releases the lock on the specified map and puts the calling
  *	thread to sleep.  The calling thread will remain asleep until either
  *	vm_map_wakeup() is performed on the map or the specified timeout is
  *	exceeded.
  *
  *	WARNING!  This function does not perform deferred deallocations of
  *	objects and map	entries.  Therefore, the calling thread is expected to
  *	reacquire the map lock after reawakening and later perform an ordinary
  *	unlock operation, such as vm_map_unlock(), before completing its
  *	operation on the map.
  */
 int
 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
 {
 
 	VM_MAP_UNLOCK_CONSISTENT(map);
 	mtx_lock(&map_sleep_mtx);
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_xunlock_(&map->lock, file, line);
 	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
 	    timo));
 }
 
 /*
  *	vm_map_wakeup:
  *
  *	Awaken any threads that have slept on the map using
  *	vm_map_unlock_and_wait().
  */
 void
 vm_map_wakeup(vm_map_t map)
 {
 
 	/*
 	 * Acquire and release map_sleep_mtx to prevent a wakeup()
 	 * from being performed (and lost) between the map unlock
 	 * and the msleep() in _vm_map_unlock_and_wait().
 	 */
 	mtx_lock(&map_sleep_mtx);
 	mtx_unlock(&map_sleep_mtx);
 	wakeup(&map->root);
 }
 
 void
 vm_map_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	map->busy++;
 }
 
 void
 vm_map_unbusy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
 	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
 		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
 		wakeup(&map->busy);
 	}
 }
 
 void 
 vm_map_wait_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	while (map->busy) {
 		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
 		if (map->system_map)
 			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
 		else
 			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
 	}
 	map->timestamp++;
 }
 
 long
 vmspace_resident_count(struct vmspace *vmspace)
 {
 	return pmap_resident_count(vmspace_pmap(vmspace));
 }
 
 /*
  *	vm_map_create:
  *
  *	Creates and returns a new empty VM map with
  *	the given physical map structure, and having
  *	the given lower and upper address bounds.
  */
 vm_map_t
 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 	vm_map_t result;
 
 	result = uma_zalloc(mapzone, M_WAITOK);
 	CTR1(KTR_VM, "vm_map_create: %p", result);
 	_vm_map_init(result, pmap, min, max);
 	return (result);
 }
 
 /*
  * Initialize an existing vm_map structure
  * such as that in the vmspace structure.
  */
 static void
 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	map->header.eflags = MAP_ENTRY_HEADER;
 	map->needs_wakeup = FALSE;
 	map->system_map = 0;
 	map->pmap = pmap;
 	map->header.end = min;
 	map->header.start = max;
 	map->flags = 0;
 	map->header.left = map->header.right = &map->header;
 	map->root = NULL;
 	map->timestamp = 0;
 	map->busy = 0;
 	map->anon_loc = 0;
 #ifdef DIAGNOSTIC
 	map->nupdates = 0;
 #endif
 }
 
 void
 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	_vm_map_init(map, pmap, min, max);
 	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "user map");
 }
 
 /*
  *	vm_map_entry_dispose:	[ internal use only ]
  *
  *	Inverse of vm_map_entry_create.
  */
 static void
 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_create:	[ internal use only ]
  *
  *	Allocates a VM map entry for insertion.
  *	No entry fields are filled in.
  */
 static vm_map_entry_t
 vm_map_entry_create(vm_map_t map)
 {
 	vm_map_entry_t new_entry;
 
 	if (map->system_map)
 		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
 	else
 		new_entry = uma_zalloc(mapentzone, M_WAITOK);
 	if (new_entry == NULL)
 		panic("vm_map_entry_create: kernel resources exhausted");
 	return (new_entry);
 }
 
 /*
  *	vm_map_entry_set_behavior:
  *
  *	Set the expected access behavior, either normal, random, or
  *	sequential.
  */
 static inline void
 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
 {
 	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
 	    (behavior & MAP_ENTRY_BEHAV_MASK);
 }
 
 /*
  *	vm_map_entry_max_free_{left,right}:
  *
  *	Compute the size of the largest free gap between two entries,
  *	one the root of a tree and the other the ancestor of that root
  *	that is the least or greatest ancestor found on the search path.
  */
 static inline vm_size_t
 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
 {
 
 	return (root->left != left_ancestor ?
 	    root->left->max_free : root->start - left_ancestor->end);
 }
 
 static inline vm_size_t
 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
 {
 
 	return (root->right != right_ancestor ?
 	    root->right->max_free : right_ancestor->start - root->end);
 }
 
 /*
  *	vm_map_entry_{pred,succ}:
  *
  *	Find the {predecessor, successor} of the entry by taking one step
  *	in the appropriate direction and backtracking as much as necessary.
  *	vm_map_entry_succ is defined in vm_map.h.
  */
 static inline vm_map_entry_t
 vm_map_entry_pred(vm_map_entry_t entry)
 {
 	vm_map_entry_t prior;
 
 	prior = entry->left;
 	if (prior->right->start < entry->start) {
 		do
 			prior = prior->right;
 		while (prior->right != entry);
 	}
 	return (prior);
 }
 
 static inline vm_size_t
 vm_size_max(vm_size_t a, vm_size_t b)
 {
 
 	return (a > b ? a : b);
 }
 
 #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do {		\
 	vm_map_entry_t z;						\
 	vm_size_t max_free;						\
 									\
 	/*								\
 	 * Infer root->right->max_free == root->max_free when		\
 	 * y->max_free < root->max_free || root->max_free == 0.		\
 	 * Otherwise, look right to find it.				\
 	 */								\
 	y = root->left;							\
 	max_free = root->max_free;					\
 	KASSERT(max_free == vm_size_max(				\
 	    vm_map_entry_max_free_left(root, llist),			\
 	    vm_map_entry_max_free_right(root, rlist)),			\
 	    ("%s: max_free invariant fails", __func__));		\
 	if (max_free - 1 < vm_map_entry_max_free_left(root, llist))	\
 		max_free = vm_map_entry_max_free_right(root, rlist);	\
 	if (y != llist && (test)) {					\
 		/* Rotate right and make y root. */			\
 		z = y->right;						\
 		if (z != root) {					\
 			root->left = z;					\
 			y->right = root;				\
 			if (max_free < y->max_free)			\
 			    root->max_free = max_free =			\
 			    vm_size_max(max_free, z->max_free);		\
 		} else if (max_free < y->max_free)			\
 			root->max_free = max_free =			\
 			    vm_size_max(max_free, root->start - y->end);\
 		root = y;						\
 		y = root->left;						\
 	}								\
 	/* Copy right->max_free.  Put root on rlist. */			\
 	root->max_free = max_free;					\
 	KASSERT(max_free == vm_map_entry_max_free_right(root, rlist),	\
 	    ("%s: max_free not copied from right", __func__));		\
 	root->left = rlist;						\
 	rlist = root;							\
 	root = y != llist ? y : NULL;					\
 } while (0)
 
 #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do {		\
 	vm_map_entry_t z;						\
 	vm_size_t max_free;						\
 									\
 	/*								\
 	 * Infer root->left->max_free == root->max_free when		\
 	 * y->max_free < root->max_free || root->max_free == 0.		\
 	 * Otherwise, look left to find it.				\
 	 */								\
 	y = root->right;						\
 	max_free = root->max_free;					\
 	KASSERT(max_free == vm_size_max(				\
 	    vm_map_entry_max_free_left(root, llist),			\
 	    vm_map_entry_max_free_right(root, rlist)),			\
 	    ("%s: max_free invariant fails", __func__));		\
 	if (max_free - 1 < vm_map_entry_max_free_right(root, rlist))	\
 		max_free = vm_map_entry_max_free_left(root, llist);	\
 	if (y != rlist && (test)) {					\
 		/* Rotate left and make y root. */			\
 		z = y->left;						\
 		if (z != root) {					\
 			root->right = z;				\
 			y->left = root;					\
 			if (max_free < y->max_free)			\
 			    root->max_free = max_free =			\
 			    vm_size_max(max_free, z->max_free);		\
 		} else if (max_free < y->max_free)			\
 			root->max_free = max_free =			\
 			    vm_size_max(max_free, y->start - root->end);\
 		root = y;						\
 		y = root->right;					\
 	}								\
 	/* Copy left->max_free.  Put root on llist. */			\
 	root->max_free = max_free;					\
 	KASSERT(max_free == vm_map_entry_max_free_left(root, llist),	\
 	    ("%s: max_free not copied from left", __func__));		\
 	root->right = llist;						\
 	llist = root;							\
 	root = y != rlist ? y : NULL;					\
 } while (0)
 
 /*
  * Walk down the tree until we find addr or a gap where addr would go, breaking
  * off left and right subtrees of nodes less than, or greater than addr.  Treat
  * subtrees with root->max_free < length as empty trees.  llist and rlist are
  * the two sides in reverse order (bottom-up), with llist linked by the right
  * pointer and rlist linked by the left pointer in the vm_map_entry, and both
  * lists terminated by &map->header.  This function, and the subsequent call to
  * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
  * values in &map->header.
  */
 static __always_inline vm_map_entry_t
 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
     vm_map_entry_t *llist, vm_map_entry_t *rlist)
 {
 	vm_map_entry_t left, right, root, y;
 
 	left = right = &map->header;
 	root = map->root;
 	while (root != NULL && root->max_free >= length) {
 		KASSERT(left->end <= root->start &&
 		    root->end <= right->start,
 		    ("%s: root not within tree bounds", __func__));
 		if (addr < root->start) {
 			SPLAY_LEFT_STEP(root, y, left, right,
 			    y->max_free >= length && addr < y->start);
 		} else if (addr >= root->end) {
 			SPLAY_RIGHT_STEP(root, y, left, right,
 			    y->max_free >= length && addr >= y->end);
 		} else
 			break;
 	}
 	*llist = left;
 	*rlist = right;
 	return (root);
 }
 
 static __always_inline void
 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
 {
 	vm_map_entry_t hi, right, y;
 
 	right = *rlist;
 	hi = root->right == right ? NULL : root->right;
 	if (hi == NULL)
 		return;
 	do
 		SPLAY_LEFT_STEP(hi, y, root, right, true);
 	while (hi != NULL);
 	*rlist = right;
 }
 
 static __always_inline void
 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
 {
 	vm_map_entry_t left, lo, y;
 
 	left = *llist;
 	lo = root->left == left ? NULL : root->left;
 	if (lo == NULL)
 		return;
 	do
 		SPLAY_RIGHT_STEP(lo, y, left, root, true);
 	while (lo != NULL);
 	*llist = left;
 }
 
 static inline void
 vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
 {
 	vm_map_entry_t tmp;
 
 	tmp = *b;
 	*b = *a;
 	*a = tmp;
 }
 
 /*
  * Walk back up the two spines, flip the pointers and set max_free.  The
  * subtrees of the root go at the bottom of llist and rlist.
  */
 static vm_size_t
 vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
 {
 	do {
 		/*
 		 * The max_free values of the children of llist are in
 		 * llist->max_free and max_free.  Update with the
 		 * max value.
 		 */
 		llist->max_free = max_free =
 		    vm_size_max(llist->max_free, max_free);
 		vm_map_entry_swap(&llist->right, &tail);
 		vm_map_entry_swap(&tail, &llist);
 	} while (llist != header);
 	root->left = tail;
 	return (max_free);
 }
 
 /*
  * When llist is known to be the predecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t llist)
 {
 	vm_size_t max_free;
 
 	max_free = root->start - llist->end;
 	if (llist != header) {
 		max_free = vm_map_splay_merge_left_walk(header, root,
 		    root, max_free, llist);
 	} else {
 		root->left = header;
 		header->right = root;
 	}
 	return (max_free);
 }
 
 /*
  * When llist may or may not be the predecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t llist)
 {
 	vm_size_t max_free;
 
 	max_free = vm_map_entry_max_free_left(root, llist);
 	if (llist != header) {
 		max_free = vm_map_splay_merge_left_walk(header, root,
 		    root->left == llist ? root : root->left,
 		    max_free, llist);
 	}
 	return (max_free);
 }
 
 static vm_size_t
 vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
 {
 	do {
 		/*
 		 * The max_free values of the children of rlist are in
 		 * rlist->max_free and max_free.  Update with the
 		 * max value.
 		 */
 		rlist->max_free = max_free =
 		    vm_size_max(rlist->max_free, max_free);
 		vm_map_entry_swap(&rlist->left, &tail);
 		vm_map_entry_swap(&tail, &rlist);
 	} while (rlist != header);
 	root->right = tail;
 	return (max_free);
 }
 
 /*
  * When rlist is known to be the succecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t rlist)
 {
 	vm_size_t max_free;
 
 	max_free = rlist->start - root->end;
 	if (rlist != header) {
 		max_free = vm_map_splay_merge_right_walk(header, root,
 		    root, max_free, rlist);
 	} else {
 		root->right = header;
 		header->left = root;
 	}
 	return (max_free);
 }
 
 /*
  * When rlist may or may not be the succecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t rlist)
 {
 	vm_size_t max_free;
 
 	max_free = vm_map_entry_max_free_right(root, rlist);
 	if (rlist != header) {
 		max_free = vm_map_splay_merge_right_walk(header, root,
 		    root->right == rlist ? root : root->right,
 		    max_free, rlist);
 	}
 	return (max_free);
 }
 
 /*
  *	vm_map_splay:
  *
  *	The Sleator and Tarjan top-down splay algorithm with the
  *	following variation.  Max_free must be computed bottom-up, so
  *	on the downward pass, maintain the left and right spines in
  *	reverse order.  Then, make a second pass up each side to fix
  *	the pointers and compute max_free.  The time bound is O(log n)
  *	amortized.
  *
  *	The tree is threaded, which means that there are no null pointers.
  *	When a node has no left child, its left pointer points to its
  *	predecessor, which the last ancestor on the search path from the root
  *	where the search branched right.  Likewise, when a node has no right
  *	child, its right pointer points to its successor.  The map header node
  *	is the predecessor of the first map entry, and the successor of the
  *	last.
  *
  *	The new root is the vm_map_entry containing "addr", or else an
  *	adjacent entry (lower if possible) if addr is not in the tree.
  *
  *	The map must be locked, and leaves it so.
  *
  *	Returns: the new root.
  */
 static vm_map_entry_t
 vm_map_splay(vm_map_t map, vm_offset_t addr)
 {
 	vm_map_entry_t header, llist, rlist, root;
 	vm_size_t max_free_left, max_free_right;
 
 	header = &map->header;
 	root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
 	if (root != NULL) {
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else if (llist != header) {
 		/*
 		 * Recover the greatest node in the left
 		 * subtree and make it the root.
 		 */
 		root = llist;
 		llist = root->right;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
 	} else if (rlist != header) {
 		/*
 		 * Recover the least node in the right
 		 * subtree and make it the root.
 		 */
 		root = rlist;
 		rlist = root->left;
 		max_free_left = vm_map_splay_merge_pred(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else {
 		/* There is no root. */
 		return (NULL);
 	}
 	root->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	return (root);
 }
 
 /*
  *	vm_map_entry_{un,}link:
  *
  *	Insert/remove entries from maps.  On linking, if new entry clips
  *	existing entry, trim existing entry to avoid overlap, and manage
  *	offsets.  On unlinking, merge disappearing entry with neighbor, if
  *	called for, and manage offsets.  Callers should not modify fields in
  *	entries already mapped.
  */
 static void
 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_map_entry_t header, llist, rlist, root;
 	vm_size_t max_free_left, max_free_right;
 
 	CTR3(KTR_VM,
 	    "vm_map_entry_link: map %p, nentries %d, entry %p", map,
 	    map->nentries, entry);
 	VM_MAP_ASSERT_LOCKED(map);
 	map->nentries++;
 	header = &map->header;
 	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
 	if (root == NULL) {
 		/*
 		 * The new entry does not overlap any existing entry in the
 		 * map, so it becomes the new root of the map tree.
 		 */
 		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
 		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
 	} else if (entry->start == root->start) {
 		/*
 		 * The new entry is a clone of root, with only the end field
 		 * changed.  The root entry will be shrunk to abut the new
 		 * entry, and will be the right child of the new root entry in
 		 * the modified map.
 		 */
 		KASSERT(entry->end < root->end,
 		    ("%s: clip_start not within entry", __func__));
 		vm_map_splay_findprev(root, &llist);
 		root->offset += entry->end - root->start;
 		root->start = entry->end;
 		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
 		max_free_right = root->max_free = vm_size_max(
 		    vm_map_splay_merge_pred(entry, root, entry),
 		    vm_map_splay_merge_right(header, root, rlist));
 	} else {
 		/*
 		 * The new entry is a clone of root, with only the start field
 		 * changed.  The root entry will be shrunk to abut the new
 		 * entry, and will be the left child of the new root entry in
 		 * the modified map.
 		 */
 		KASSERT(entry->end == root->end,
 		    ("%s: clip_start not within entry", __func__));
 		vm_map_splay_findnext(root, &rlist);
 		entry->offset += entry->start - root->start;
 		root->end = entry->start;
 		max_free_left = root->max_free = vm_size_max(
 		    vm_map_splay_merge_left(header, root, llist),
 		    vm_map_splay_merge_succ(entry, root, entry));
 		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
 	}
 	entry->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = entry;
 	VM_MAP_ASSERT_CONSISTENT(map);
 }
 
 enum unlink_merge_type {
 	UNLINK_MERGE_NONE,
 	UNLINK_MERGE_NEXT
 };
 
 static void
 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
     enum unlink_merge_type op)
 {
 	vm_map_entry_t header, llist, rlist, root;
 	vm_size_t max_free_left, max_free_right;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	header = &map->header;
 	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
 	KASSERT(root != NULL,
 	    ("vm_map_entry_unlink: unlink object not mapped"));
 
 	vm_map_splay_findprev(root, &llist);
 	vm_map_splay_findnext(root, &rlist);
 	if (op == UNLINK_MERGE_NEXT) {
 		rlist->start = root->start;
 		rlist->offset = root->offset;
 	}
 	if (llist != header) {
 		root = llist;
 		llist = root->right;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
 	} else if (rlist != header) {
 		root = rlist;
 		rlist = root->left;
 		max_free_left = vm_map_splay_merge_pred(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else {
 		header->left = header->right = header;
 		root = NULL;
 	}
 	if (root != NULL)
 		root->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	map->nentries--;
 	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
 	    map->nentries, entry);
 }
 
 /*
  *	vm_map_entry_resize:
  *
  *	Resize a vm_map_entry, recompute the amount of free space that
  *	follows it and propagate that value up the tree.
  *
  *	The map must be locked, and leaves it so.
  */
 static void
 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
 {
 	vm_map_entry_t header, llist, rlist, root;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	header = &map->header;
 	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
 	KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
 	vm_map_splay_findnext(root, &rlist);
 	entry->end += grow_amount;
 	root->max_free = vm_size_max(
 	    vm_map_splay_merge_left(header, root, llist),
 	    vm_map_splay_merge_succ(header, root, rlist));
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
 	    __func__, map, map->nentries, entry);
 }
 
 /*
  *	vm_map_lookup_entry:	[ internal use only ]
  *
  *	Finds the map entry containing (or
  *	immediately preceding) the specified address
  *	in the given map; the entry is returned
  *	in the "entry" parameter.  The boolean
  *	result indicates whether the address is
  *	actually contained in the map.
  */
 boolean_t
 vm_map_lookup_entry(
 	vm_map_t map,
 	vm_offset_t address,
 	vm_map_entry_t *entry)	/* OUT */
 {
 	vm_map_entry_t cur, header, lbound, ubound;
 	boolean_t locked;
 
 	/*
 	 * If the map is empty, then the map entry immediately preceding
 	 * "address" is the map's header.
 	 */
 	header = &map->header;
 	cur = map->root;
 	if (cur == NULL) {
 		*entry = header;
 		return (FALSE);
 	}
 	if (address >= cur->start && cur->end > address) {
 		*entry = cur;
 		return (TRUE);
 	}
 	if ((locked = vm_map_locked(map)) ||
 	    sx_try_upgrade(&map->lock)) {
 		/*
 		 * Splay requires a write lock on the map.  However, it only
 		 * restructures the binary search tree; it does not otherwise
 		 * change the map.  Thus, the map's timestamp need not change
 		 * on a temporary upgrade.
 		 */
 		cur = vm_map_splay(map, address);
 		if (!locked) {
 			VM_MAP_UNLOCK_CONSISTENT(map);
 			sx_downgrade(&map->lock);
 		}
 
 		/*
 		 * If "address" is contained within a map entry, the new root
 		 * is that map entry.  Otherwise, the new root is a map entry
 		 * immediately before or after "address".
 		 */
 		if (address < cur->start) {
 			*entry = header;
 			return (FALSE);
 		}
 		*entry = cur;
 		return (address < cur->end);
 	}
 	/*
 	 * Since the map is only locked for read access, perform a
 	 * standard binary search tree lookup for "address".
 	 */
 	lbound = ubound = header;
 	for (;;) {
 		if (address < cur->start) {
 			ubound = cur;
 			cur = cur->left;
 			if (cur == lbound)
 				break;
 		} else if (cur->end <= address) {
 			lbound = cur;
 			cur = cur->right;
 			if (cur == ubound)
 				break;
 		} else {
 			*entry = cur;
 			return (TRUE);
 		}
 	}
 	*entry = lbound;
 	return (FALSE);
 }
 
 /*
  *	vm_map_insert:
  *
  *	Inserts the given whole VM object into the target
  *	map at the specified address range.  The object's
  *	size should match that of the address range.
  *
  *	Requires that the map be locked, and leaves it so.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, next_entry, prev_entry;
 	struct ucred *cred;
 	vm_eflags_t protoeflags;
 	vm_inherit_t inheritance;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(object != kernel_object ||
 	    (cow & MAP_COPY_ON_WRITE) == 0,
 	    ("vm_map_insert: kernel object and COW"));
 	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
 	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
 	KASSERT((prot & ~max) == 0,
 	    ("prot %#x is not subset of max_prot %#x", prot, max));
 
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
 	if (start == end || !vm_map_range_valid(map, start, end))
 		return (KERN_INVALID_ADDRESS);
 
 	/*
 	 * Find the entry prior to the proposed starting address; if it's part
 	 * of an existing entry, this range is bogus.
 	 */
 	if (vm_map_lookup_entry(map, start, &prev_entry))
 		return (KERN_NO_SPACE);
 
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
 	next_entry = vm_map_entry_succ(prev_entry);
 	if (next_entry->start < end)
 		return (KERN_NO_SPACE);
 
 	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
 	    max != VM_PROT_NONE))
 		return (KERN_INVALID_ARGUMENT);
 
 	protoeflags = 0;
 	if (cow & MAP_COPY_ON_WRITE)
 		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
 	if (cow & MAP_NOFAULT)
 		protoeflags |= MAP_ENTRY_NOFAULT;
 	if (cow & MAP_DISABLE_SYNCER)
 		protoeflags |= MAP_ENTRY_NOSYNC;
 	if (cow & MAP_DISABLE_COREDUMP)
 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
 	if (cow & MAP_STACK_GROWS_DOWN)
 		protoeflags |= MAP_ENTRY_GROWS_DOWN;
 	if (cow & MAP_STACK_GROWS_UP)
 		protoeflags |= MAP_ENTRY_GROWS_UP;
 	if (cow & MAP_WRITECOUNT)
 		protoeflags |= MAP_ENTRY_WRITECNT;
 	if (cow & MAP_VN_EXEC)
 		protoeflags |= MAP_ENTRY_VN_EXEC;
 	if ((cow & MAP_CREATE_GUARD) != 0)
 		protoeflags |= MAP_ENTRY_GUARD;
 	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
 		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
 	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
 		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
 	if (cow & MAP_INHERIT_SHARE)
 		inheritance = VM_INHERIT_SHARE;
 	else
 		inheritance = VM_INHERIT_DEFAULT;
 
 	cred = NULL;
 	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
 		goto charged;
 	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
 	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
 		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
 			return (KERN_RESOURCE_SHORTAGE);
 		KASSERT(object == NULL ||
 		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
 		    object->cred == NULL,
 		    ("overcommit: vm_map_insert o %p", object));
 		cred = curthread->td_ucred;
 	}
 
 charged:
 	/* Expand the kernel pmap, if necessary. */
 	if (map == kernel_map && end > kernel_vm_end)
 		pmap_growkernel(end);
 	if (object != NULL) {
 		/*
 		 * OBJ_ONEMAPPING must be cleared unless this mapping
 		 * is trivially proven to be the only mapping for any
 		 * of the object's pages.  (Object granularity
 		 * reference counting is insufficient to recognize
 		 * aliases with precision.)
 		 */
 		if ((object->flags & OBJ_ANON) != 0) {
 			VM_OBJECT_WLOCK(object);
 			if (object->ref_count > 1 || object->shadow_count != 0)
 				vm_object_clear_flag(object, OBJ_ONEMAPPING);
 			VM_OBJECT_WUNLOCK(object);
 		}
 	} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
 	    protoeflags &&
 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
 	    MAP_VN_EXEC)) == 0 &&
 	    prev_entry->end == start && (prev_entry->cred == cred ||
 	    (prev_entry->object.vm_object != NULL &&
 	    prev_entry->object.vm_object->cred == cred)) &&
 	    vm_object_coalesce(prev_entry->object.vm_object,
 	    prev_entry->offset,
 	    (vm_size_t)(prev_entry->end - prev_entry->start),
 	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
 	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
 		/*
 		 * We were able to extend the object.  Determine if we
 		 * can extend the previous map entry to include the
 		 * new range as well.
 		 */
 		if (prev_entry->inheritance == inheritance &&
 		    prev_entry->protection == prot &&
 		    prev_entry->max_protection == max &&
 		    prev_entry->wired_count == 0) {
 			KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
 			    0, ("prev_entry %p has incoherent wiring",
 			    prev_entry));
 			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
 				map->size += end - prev_entry->end;
 			vm_map_entry_resize(map, prev_entry,
 			    end - prev_entry->end);
 			vm_map_try_merge_entries(map, prev_entry, next_entry);
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * If we can extend the object but cannot extend the
 		 * map entry, we have to create a new map entry.  We
 		 * must bump the ref count on the extended object to
 		 * account for it.  object may be NULL.
 		 */
 		object = prev_entry->object.vm_object;
 		offset = prev_entry->offset +
 		    (prev_entry->end - prev_entry->start);
 		vm_object_reference(object);
 		if (cred != NULL && object != NULL && object->cred != NULL &&
 		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			/* Object already accounts for this uid. */
 			cred = NULL;
 		}
 	}
 	if (cred != NULL)
 		crhold(cred);
 
 	/*
 	 * Create a new entry
 	 */
 	new_entry = vm_map_entry_create(map);
 	new_entry->start = start;
 	new_entry->end = end;
 	new_entry->cred = NULL;
 
 	new_entry->eflags = protoeflags;
 	new_entry->object.vm_object = object;
 	new_entry->offset = offset;
 
 	new_entry->inheritance = inheritance;
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
 	new_entry->wiring_thread = NULL;
 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
 	new_entry->next_read = start;
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
 	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
 	new_entry->cred = cred;
 
 	/*
 	 * Insert the new entry into the list
 	 */
 	vm_map_entry_link(map, new_entry);
 	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
 		map->size += new_entry->end - new_entry->start;
 
 	/*
 	 * Try to coalesce the new entry with both the previous and next
 	 * entries in the list.  Previously, we only attempted to coalesce
 	 * with the previous entry when object is NULL.  Here, we handle the
 	 * other cases, which are less common.
 	 */
 	vm_map_try_merge_entries(map, prev_entry, new_entry);
 	vm_map_try_merge_entries(map, new_entry, next_entry);
 
 	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
 		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
 		    end - start, cow & MAP_PREFAULT_PARTIAL);
 	}
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_findspace:
  *
  *	Find the first fit (lowest VM address) for "length" free bytes
  *	beginning at address >= start in the given map.
  *
  *	In a vm_map_entry, "max_free" is the maximum amount of
  *	contiguous free space between an entry in its subtree and a
  *	neighbor of that entry.  This allows finding a free region in
  *	one path down the tree, so O(log n) amortized with splay
  *	trees.
  *
  *	The map must be locked, and leaves it so.
  *
  *	Returns: starting address if sufficient space,
  *		 vm_map_max(map)-length+1 if insufficient space.
  */
 vm_offset_t
 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
 {
 	vm_map_entry_t header, llist, rlist, root, y;
 	vm_size_t left_length, max_free_left, max_free_right;
 	vm_offset_t gap_end;
 
 	/*
 	 * Request must fit within min/max VM address and must avoid
 	 * address wrap.
 	 */
 	start = MAX(start, vm_map_min(map));
 	if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
 		return (vm_map_max(map) - length + 1);
 
 	/* Empty tree means wide open address space. */
 	if (map->root == NULL)
 		return (start);
 
 	/*
 	 * After splay_split, if start is within an entry, push it to the start
 	 * of the following gap.  If rlist is at the end of the gap containing
 	 * start, save the end of that gap in gap_end to see if the gap is big
 	 * enough; otherwise set gap_end to start skip gap-checking and move
 	 * directly to a search of the right subtree.
 	 */
 	header = &map->header;
 	root = vm_map_splay_split(map, start, length, &llist, &rlist);
 	gap_end = rlist->start;
 	if (root != NULL) {
 		start = root->end;
 		if (root->right != rlist)
 			gap_end = start;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else if (rlist != header) {
 		root = rlist;
 		rlist = root->left;
 		max_free_left = vm_map_splay_merge_pred(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else {
 		root = llist;
 		llist = root->right;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
 	}
 	root->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	if (length <= gap_end - start)
 		return (start);
 
 	/* With max_free, can immediately tell if no solution. */
 	if (root->right == header || length > root->right->max_free)
 		return (vm_map_max(map) - length + 1);
 
 	/*
 	 * Splay for the least large-enough gap in the right subtree.
 	 */
 	llist = rlist = header;
 	for (left_length = 0;;
 	    left_length = vm_map_entry_max_free_left(root, llist)) {
 		if (length <= left_length)
 			SPLAY_LEFT_STEP(root, y, llist, rlist,
 			    length <= vm_map_entry_max_free_left(y, llist));
 		else
 			SPLAY_RIGHT_STEP(root, y, llist, rlist,
 			    length > vm_map_entry_max_free_left(y, root));
 		if (root == NULL)
 			break;
 	}
 	root = llist;
 	llist = root->right;
 	max_free_left = vm_map_splay_merge_left(header, root, llist);
 	if (rlist == header) {
 		root->max_free = vm_size_max(max_free_left,
 		    vm_map_splay_merge_succ(header, root, rlist));
 	} else {
 		y = rlist;
 		rlist = y->left;
 		y->max_free = vm_size_max(
 		    vm_map_splay_merge_pred(root, y, root),
 		    vm_map_splay_merge_right(header, y, rlist));
 		root->max_free = vm_size_max(max_free_left, y->max_free);
 	}
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	return (root->end);
 }
 
 int
 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_size_t length, vm_prot_t prot,
     vm_prot_t max, int cow)
 {
 	vm_offset_t end;
 	int result;
 
 	end = start + length;
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_fixed: non-NULL backing object for stack"));
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if ((cow & MAP_CHECK_EXCL) == 0)
 		vm_map_delete(map, start, end);
 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 		result = vm_map_stack_locked(map, start, length, sgrowsiz,
 		    prot, max, cow);
 	} else {
 		result = vm_map_insert(map, object, offset, start, end,
 		    prot, max, cow);
 	}
 	vm_map_unlock(map);
 	return (result);
 }
 
 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
 
 static int cluster_anon = 1;
 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
     &cluster_anon, 0,
     "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
 
 static bool
 clustering_anon_allowed(vm_offset_t addr)
 {
 
 	switch (cluster_anon) {
 	case 0:
 		return (false);
 	case 1:
 		return (addr == 0);
 	case 2:
 	default:
 		return (true);
 	}
 }
 
 static long aslr_restarts;
 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
     &aslr_restarts, 0,
     "Number of aslr failures");
 
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 
 /*
  * Searches for the specified amount of free space in the given map with the
  * specified alignment.  Performs an address-ordered, first-fit search from
  * the given address "*addr", with an optional upper bound "max_addr".  If the
  * parameter "alignment" is zero, then the alignment is computed from the
  * given (object, offset) pair so as to enable the greatest possible use of
  * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
  * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
  *
  * The map must be locked.  Initially, there must be at least "length" bytes
  * of free space at the given address.
  */
 static int
 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
     vm_offset_t alignment)
 {
 	vm_offset_t aligned_addr, free_addr;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	free_addr = *addr;
 	KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
 	    ("caller failed to provide space %#jx at address %p",
 	     (uintmax_t)length, (void *)free_addr));
 	for (;;) {
 		/*
 		 * At the start of every iteration, the free space at address
 		 * "*addr" is at least "length" bytes.
 		 */
 		if (alignment == 0)
 			pmap_align_superpage(object, offset, addr, length);
 		else if ((*addr & (alignment - 1)) != 0) {
 			*addr &= ~(alignment - 1);
 			*addr += alignment;
 		}
 		aligned_addr = *addr;
 		if (aligned_addr == free_addr) {
 			/*
 			 * Alignment did not change "*addr", so "*addr" must
 			 * still provide sufficient free space.
 			 */
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * Test for address wrap on "*addr".  A wrapped "*addr" could
 		 * be a valid address, in which case vm_map_findspace() cannot
 		 * be relied upon to fail.
 		 */
 		if (aligned_addr < free_addr)
 			return (KERN_NO_SPACE);
 		*addr = vm_map_findspace(map, aligned_addr, length);
 		if (*addr + length > vm_map_max(map) ||
 		    (max_addr != 0 && *addr + length > max_addr))
 			return (KERN_NO_SPACE);
 		free_addr = *addr;
 		if (free_addr == aligned_addr) {
 			/*
 			 * If a successful call to vm_map_findspace() did not
 			 * change "*addr", then "*addr" must still be aligned
 			 * and provide sufficient free space.
 			 */
 			return (KERN_SUCCESS);
 		}
 	}
 }
 
 /*
  *	vm_map_find finds an unallocated region in the target address
  *	map with the given length.  The search is defined to be
  *	first-fit from the specified address; the region found is
  *	returned in the same parameter.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	    vm_offset_t *addr,	/* IN/OUT */
 	    vm_size_t length, vm_offset_t max_addr, int find_space,
 	    vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_offset_t alignment, curr_min_addr, min_addr;
 	int gap, pidx, rv, try;
 	bool cluster, en_aslr, update_anon;
 
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_find: non-NULL backing object for stack"));
 	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
 	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
 	    (object->flags & OBJ_COLORED) == 0))
 		find_space = VMFS_ANY_SPACE;
 	if (find_space >> 8 != 0) {
 		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
 		alignment = (vm_offset_t)1 << (find_space >> 8);
 	} else
 		alignment = 0;
 	en_aslr = (map->flags & MAP_ASLR) != 0;
 	update_anon = cluster = clustering_anon_allowed(*addr) &&
 	    (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
 	    find_space != VMFS_NO_SPACE && object == NULL &&
 	    (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
 	    MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
 	curr_min_addr = min_addr = *addr;
 	if (en_aslr && min_addr == 0 && !cluster &&
 	    find_space != VMFS_NO_SPACE &&
 	    (map->flags & MAP_ASLR_IGNSTART) != 0)
 		curr_min_addr = min_addr = vm_map_min(map);
 	try = 0;
 	vm_map_lock(map);
 	if (cluster) {
 		curr_min_addr = map->anon_loc;
 		if (curr_min_addr == 0)
 			cluster = false;
 	}
 	if (find_space != VMFS_NO_SPACE) {
 		KASSERT(find_space == VMFS_ANY_SPACE ||
 		    find_space == VMFS_OPTIMAL_SPACE ||
 		    find_space == VMFS_SUPER_SPACE ||
 		    alignment != 0, ("unexpected VMFS flag"));
 again:
 		/*
 		 * When creating an anonymous mapping, try clustering
 		 * with an existing anonymous mapping first.
 		 *
 		 * We make up to two attempts to find address space
 		 * for a given find_space value. The first attempt may
 		 * apply randomization or may cluster with an existing
 		 * anonymous mapping. If this first attempt fails,
 		 * perform a first-fit search of the available address
 		 * space.
 		 *
 		 * If all tries failed, and find_space is
 		 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
 		 * Again enable clustering and randomization.
 		 */
 		try++;
 		MPASS(try <= 2);
 
 		if (try == 2) {
 			/*
 			 * Second try: we failed either to find a
 			 * suitable region for randomizing the
 			 * allocation, or to cluster with an existing
 			 * mapping.  Retry with free run.
 			 */
 			curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
 			    vm_map_min(map) : min_addr;
 			atomic_add_long(&aslr_restarts, 1);
 		}
 
 		if (try == 1 && en_aslr && !cluster) {
 			/*
 			 * Find space for allocation, including
 			 * gap needed for later randomization.
 			 */
 			pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
 			    (find_space == VMFS_SUPER_SPACE || find_space ==
 			    VMFS_OPTIMAL_SPACE) ? 1 : 0;
 			gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
 			    (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
 			    aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
 			*addr = vm_map_findspace(map, curr_min_addr,
 			    length + gap * pagesizes[pidx]);
 			if (*addr + length + gap * pagesizes[pidx] >
 			    vm_map_max(map))
 				goto again;
 			/* And randomize the start address. */
 			*addr += (arc4random() % gap) * pagesizes[pidx];
 			if (max_addr != 0 && *addr + length > max_addr)
 				goto again;
 		} else {
 			*addr = vm_map_findspace(map, curr_min_addr, length);
 			if (*addr + length > vm_map_max(map) ||
 			    (max_addr != 0 && *addr + length > max_addr)) {
 				if (cluster) {
 					cluster = false;
 					MPASS(try == 1);
 					goto again;
 				}
 				rv = KERN_NO_SPACE;
 				goto done;
 			}
 		}
 
 		if (find_space != VMFS_ANY_SPACE &&
 		    (rv = vm_map_alignspace(map, object, offset, addr, length,
 		    max_addr, alignment)) != KERN_SUCCESS) {
 			if (find_space == VMFS_OPTIMAL_SPACE) {
 				find_space = VMFS_ANY_SPACE;
 				curr_min_addr = min_addr;
 				cluster = update_anon;
 				try = 0;
 				goto again;
 			}
 			goto done;
 		}
 	} else if ((cow & MAP_REMAP) != 0) {
 		if (!vm_map_range_valid(map, *addr, *addr + length)) {
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		vm_map_delete(map, *addr, *addr + length);
 	}
 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 		rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
 		    max, cow);
 	} else {
 		rv = vm_map_insert(map, object, offset, *addr, *addr + length,
 		    prot, max, cow);
 	}
 	if (rv == KERN_SUCCESS && update_anon)
 		map->anon_loc = *addr + length;
 done:
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_find_min() is a variant of vm_map_find() that takes an
  *	additional parameter (min_addr) and treats the given address
  *	(*addr) differently.  Specifically, it treats *addr as a hint
  *	and not as the minimum address where the mapping is created.
  *
  *	This function works in two phases.  First, it tries to
  *	allocate above the hint.  If that fails and the hint is
  *	greater than min_addr, it performs a second pass, replacing
  *	the hint with min_addr as the minimum address for the
  *	allocation.
  */
 int
 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
     int cow)
 {
 	vm_offset_t hint;
 	int rv;
 
 	hint = *addr;
 	for (;;) {
 		rv = vm_map_find(map, object, offset, addr, length, max_addr,
 		    find_space, prot, max, cow);
 		if (rv == KERN_SUCCESS || min_addr >= hint)
 			return (rv);
 		*addr = hint = min_addr;
 	}
 }
 
 /*
  * A map entry with any of the following flags set must not be merged with
  * another entry.
  */
 #define	MAP_ENTRY_NOMERGE_MASK	(MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
 	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC)
 
 static bool
 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
 {
 
 	KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
 	    (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
 	    ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
 	    prev, entry));
 	return (prev->end == entry->start &&
 	    prev->object.vm_object == entry->object.vm_object &&
 	    (prev->object.vm_object == NULL ||
 	    prev->offset + (prev->end - prev->start) == entry->offset) &&
 	    prev->eflags == entry->eflags &&
 	    prev->protection == entry->protection &&
 	    prev->max_protection == entry->max_protection &&
 	    prev->inheritance == entry->inheritance &&
 	    prev->wired_count == entry->wired_count &&
 	    prev->cred == entry->cred);
 }
 
 static void
 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 
 	/*
 	 * If the backing object is a vnode object, vm_object_deallocate()
 	 * calls vrele().  However, vrele() does not lock the vnode because
 	 * the vnode has additional references.  Thus, the map lock can be
 	 * kept without causing a lock-order reversal with the vnode lock.
 	 *
 	 * Since we count the number of virtual page mappings in
 	 * object->un_pager.vnp.writemappings, the writemappings value
 	 * should not be adjusted when the entry is disposed of.
 	 */
 	if (entry->object.vm_object != NULL)
 		vm_object_deallocate(entry->object.vm_object);
 	if (entry->cred != NULL)
 		crfree(entry->cred);
 	vm_map_entry_dispose(map, entry);
 }
 
 /*
  *	vm_map_try_merge_entries:
  *
  *	Compare the given map entry to its predecessor, and merge its precessor
  *	into it if possible.  The entry remains valid, and may be extended.
  *	The predecessor may be deleted.
  *
  *	The map must be locked.
  */
 void
 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
     vm_map_entry_t entry)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
 	    vm_map_mergeable_neighbors(prev_entry, entry)) {
 		vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
 		vm_map_merged_neighbor_dispose(map, prev_entry);
 	}
 }
 
 /*
  *	vm_map_entry_back:
  *
  *	Allocate an object to back a map entry.
  */
 static inline void
 vm_map_entry_back(vm_map_entry_t entry)
 {
 	vm_object_t object;
 
 	KASSERT(entry->object.vm_object == NULL,
 	    ("map entry %p has backing object", entry));
 	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 	    ("map entry %p is a submap", entry));
 	object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
 	    entry->cred, entry->end - entry->start);
 	entry->object.vm_object = object;
 	entry->offset = 0;
 	entry->cred = NULL;
 }
 
 /*
  *	vm_map_entry_charge_object
  *
  *	If there is no object backing this entry, create one.  Otherwise, if
  *	the entry has cred, give it to the backing object.
  */
 static inline void
 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 	    ("map entry %p is a submap", entry));
 	if (entry->object.vm_object == NULL && !map->system_map &&
 	    (entry->eflags & MAP_ENTRY_GUARD) == 0)
 		vm_map_entry_back(entry);
 	else if (entry->object.vm_object != NULL &&
 	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 	    entry->cred != NULL) {
 		VM_OBJECT_WLOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 }
 
 /*
  *	vm_map_entry_clone
  *
  *	Create a duplicate map entry for clipping.
  */
 static vm_map_entry_t
 vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	/*
 	 * Create a backing object now, if none exists, so that more individual
 	 * objects won't be created after the map entry is split.
 	 */
 	vm_map_entry_charge_object(map, entry);
 
 	/* Clone the entry. */
 	new_entry = vm_map_entry_create(map);
 	*new_entry = *entry;
 	if (new_entry->cred != NULL)
 		crhold(entry->cred);
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		vm_object_reference(new_entry->object.vm_object);
 		vm_map_entry_set_vnode_text(new_entry, true);
 		/*
 		 * The object->un_pager.vnp.writemappings for the object of
 		 * MAP_ENTRY_WRITECNT type entry shall be kept as is here.  The
 		 * virtual pages are re-distributed among the clipped entries,
 		 * so the sum is left the same.
 		 */
 	}
 	return (new_entry);
 }
 
 /*
  *	vm_map_clip_start:	[ internal use only ]
  *
  *	Asserts that the given entry begins at or after
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 static inline void
 vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
 {
 	vm_map_entry_t new_entry;
 
 	if (start <= entry->start)
 		return;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->end > start && entry->start < start,
 	    ("%s: invalid clip of entry %p", __func__, entry));
 
 	new_entry = vm_map_entry_clone(map, entry);
 
 	/*
 	 * Split off the front portion.  Insert the new entry BEFORE this one,
 	 * so that this entry has the specified starting address.
 	 */
 	new_entry->end = start;
 	vm_map_entry_link(map, new_entry);
 }
 
 /*
  *	vm_map_lookup_clip_start:
  *
  *	Find the entry at or just after 'start', and clip it if 'start' is in
  *	the interior of the entry.  Return entry after 'start', and in
  *	prev_entry set the entry before 'start'.
  */
 static inline vm_map_entry_t
 vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
     vm_map_entry_t *prev_entry)
 {
 	vm_map_entry_t entry;
 
 	if (vm_map_lookup_entry(map, start, prev_entry)) {
 		entry = *prev_entry;
 		vm_map_clip_start(map, entry, start);
 		*prev_entry = vm_map_entry_pred(entry);
 	} else
 		entry = vm_map_entry_succ(*prev_entry);
 	return (entry);
 }
 
 /*
  *	vm_map_clip_end:	[ internal use only ]
  *
  *	Asserts that the given entry ends at or before
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 static inline void
 vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
 {
 	vm_map_entry_t new_entry;
 
 	if (end >= entry->end)
 		return;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->start < end && entry->end > end,
 	    ("%s: invalid clip of entry %p", __func__, entry));
 
 	new_entry = vm_map_entry_clone(map, entry);
 
 	/*
 	 * Split off the back portion.  Insert the new entry AFTER this one,
 	 * so that this entry has the specified ending address.
 	 */
 	new_entry->start = end;
 	vm_map_entry_link(map, new_entry);
 }
 
 /*
  *	vm_map_submap:		[ kernel use only ]
  *
  *	Mark the given range as handled by a subordinate map.
  *
  *	This range must have been created with vm_map_find,
  *	and no other operations may have been performed on this
  *	range prior to calling vm_map_submap.
  *
  *	Only a limited number of operations can be performed
  *	within this rage after calling vm_map_submap:
  *		vm_fault
  *	[Don't try vm_map_copy!]
  *
  *	To remove a submapping, one must first remove the
  *	range from the superior map, and then destroy the
  *	submap (if desired).  [Better yet, don't try it.]
  */
 int
 vm_map_submap(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	vm_map_t submap)
 {
 	vm_map_entry_t entry;
 	int result;
 
 	result = KERN_INVALID_ARGUMENT;
 
 	vm_map_lock(submap);
 	submap->flags |= MAP_IS_SUB_MAP;
 	vm_map_unlock(submap);
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
 	    (entry->eflags & MAP_ENTRY_COW) == 0 &&
 	    entry->object.vm_object == NULL) {
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		entry->object.sub_map = submap;
 		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
 		result = KERN_SUCCESS;
 	}
 	vm_map_unlock(map);
 
 	if (result != KERN_SUCCESS) {
 		vm_map_lock(submap);
 		submap->flags &= ~MAP_IS_SUB_MAP;
 		vm_map_unlock(submap);
 	}
 	return (result);
 }
 
 /*
  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
  */
 #define	MAX_INIT_PT	96
 
 /*
  *	vm_map_pmap_enter:
  *
  *	Preload the specified map's pmap with mappings to the specified
  *	object's memory-resident pages.  No further physical pages are
  *	allocated, and no further virtual pages are retrieved from secondary
  *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
  *	limited number of page mappings are created at the low-end of the
  *	specified address range.  (For this purpose, a superpage mapping
  *	counts as one page mapping.)  Otherwise, all resident pages within
  *	the specified address range are mapped.
  */
 static void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
 	vm_offset_t start;
 	vm_page_t p, p_start;
 	vm_pindex_t mask, psize, threshold, tmpidx;
 
 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
 		return;
 	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
 		VM_OBJECT_WLOCK(object);
 		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
 			pmap_object_init_pt(map->pmap, addr, object, pindex,
 			    size);
 			VM_OBJECT_WUNLOCK(object);
 			return;
 		}
 		VM_OBJECT_LOCK_DOWNGRADE(object);
 	} else
 		VM_OBJECT_RLOCK(object);
 
 	psize = atop(size);
 	if (psize + pindex > object->size) {
 		if (pindex >= object->size) {
 			VM_OBJECT_RUNLOCK(object);
 			return;
 		}
 		psize = object->size - pindex;
 	}
 
 	start = 0;
 	p_start = NULL;
 	threshold = MAX_INIT_PT;
 
 	p = vm_page_find_least(object, pindex);
 	/*
 	 * Assert: the variable p is either (1) the page with the
 	 * least pindex greater than or equal to the parameter pindex
 	 * or (2) NULL.
 	 */
 	for (;
 	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
 	     p = TAILQ_NEXT(p, listq)) {
 		/*
 		 * don't allow an madvise to blow away our really
 		 * free pages allocating pv entries.
 		 */
 		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
 		    vm_page_count_severe()) ||
 		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
 		    tmpidx >= threshold)) {
 			psize = tmpidx;
 			break;
 		}
 		if (vm_page_all_valid(p)) {
 			if (p_start == NULL) {
 				start = addr + ptoa(tmpidx);
 				p_start = p;
 			}
 			/* Jump ahead if a superpage mapping is possible. */
 			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
 			    (pagesizes[p->psind] - 1)) == 0) {
 				mask = atop(pagesizes[p->psind]) - 1;
 				if (tmpidx + mask < psize &&
 				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
 					p += mask;
 					threshold += mask;
 				}
 			}
 		} else if (p_start != NULL) {
 			pmap_enter_object(map->pmap, start, addr +
 			    ptoa(tmpidx), p_start, prot);
 			p_start = NULL;
 		}
 	}
 	if (p_start != NULL)
 		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
 		    p_start, prot);
 	VM_OBJECT_RUNLOCK(object);
 }
 
 /*
  *	vm_map_protect:
  *
  *	Sets the protection of the specified address
  *	region in the target map.  If "set_max" is
  *	specified, the maximum protection is to be set;
  *	otherwise, only the current protection is affected.
  */
 int
 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_prot_t new_prot, boolean_t set_max)
 {
 	vm_map_entry_t entry, first_entry, in_tran, prev_entry;
 	vm_object_t obj;
 	struct ucred *cred;
 	vm_prot_t old_prot;
 	int rv;
 
 	if (start == end)
 		return (KERN_SUCCESS);
 
 again:
 	in_tran = NULL;
 	vm_map_lock(map);
 
 	/*
 	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
 	 * need to fault pages into the map and will drop the map lock while
 	 * doing so, and the VM object may end up in an inconsistent state if we
 	 * update the protection on the map entry in between faults.
 	 */
 	vm_map_wait_busy(map);
 
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (!vm_map_lookup_entry(map, start, &first_entry))
 		first_entry = vm_map_entry_succ(first_entry);
 
 	/*
 	 * Make a first pass to check for protection violations.
 	 */
 	for (entry = first_entry; entry->start < end;
 	    entry = vm_map_entry_succ(entry)) {
 		if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if ((new_prot & entry->max_protection) != new_prot) {
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
 			in_tran = entry;
 	}
 
 	/*
 	 * Postpone the operation until all in-transition map entries have
 	 * stabilized.  An in-transition entry might already have its pages
 	 * wired and wired_count incremented, but not yet have its
 	 * MAP_ENTRY_USER_WIRED flag set.  In which case, we would fail to call
 	 * vm_fault_copy_entry() in the final loop below.
 	 */
 	if (in_tran != NULL) {
 		in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 		vm_map_unlock_and_wait(map, 0);
 		goto again;
 	}
 
 	/*
 	 * Before changing the protections, try to reserve swap space for any
 	 * private (i.e., copy-on-write) mappings that are transitioning from
 	 * read-only to read/write access.  If a reservation fails, break out
 	 * of this loop early and let the next loop simplify the entries, since
 	 * some may now be mergeable.
 	 */
 	rv = KERN_SUCCESS;
 	vm_map_clip_start(map, first_entry, start);
 	for (entry = first_entry; entry->start < end;
 	    entry = vm_map_entry_succ(entry)) {
 		vm_map_clip_end(map, entry, end);
 
 		if (set_max ||
 		    ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
 		    ENTRY_CHARGED(entry) ||
 		    (entry->eflags & MAP_ENTRY_GUARD) != 0) {
 			continue;
 		}
 
 		cred = curthread->td_ucred;
 		obj = entry->object.vm_object;
 
 		if (obj == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
 			if (!swap_reserve(entry->end - entry->start)) {
 				rv = KERN_RESOURCE_SHORTAGE;
 				end = entry->end;
 				break;
 			}
 			crhold(cred);
 			entry->cred = cred;
 			continue;
 		}
 
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP)
 			continue;
 		VM_OBJECT_WLOCK(obj);
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
 			VM_OBJECT_WUNLOCK(obj);
 			continue;
 		}
 
 		/*
 		 * Charge for the whole object allocation now, since
 		 * we cannot distinguish between non-charged and
 		 * charged clipped mapping of the same object later.
 		 */
 		KASSERT(obj->charge == 0,
 		    ("vm_map_protect: object %p overcharged (entry %p)",
 		    obj, entry));
 		if (!swap_reserve(ptoa(obj->size))) {
 			VM_OBJECT_WUNLOCK(obj);
 			rv = KERN_RESOURCE_SHORTAGE;
 			end = entry->end;
 			break;
 		}
 
 		crhold(cred);
 		obj->cred = cred;
 		obj->charge = ptoa(obj->size);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 
 	/*
 	 * If enough swap space was available, go back and fix up protections.
 	 * Otherwise, just simplify entries, since some may have been modified.
 	 * [Note that clipping is not necessary the second time.]
 	 */
 	for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
 	    entry->start < end;
 	    vm_map_try_merge_entries(map, prev_entry, entry),
 	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 		if (rv != KERN_SUCCESS ||
 		    (entry->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 
 		old_prot = entry->protection;
 
 		if (set_max)
 			entry->protection =
 			    (entry->max_protection = new_prot) &
 			    old_prot;
 		else
 			entry->protection = new_prot;
 
 		/*
 		 * For user wired map entries, the normal lazy evaluation of
 		 * write access upgrades through soft page faults is
 		 * undesirable.  Instead, immediately copy any pages that are
 		 * copy-on-write and enable write access in the physical map.
 		 */
 		if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
 		    (entry->protection & VM_PROT_WRITE) != 0 &&
 		    (old_prot & VM_PROT_WRITE) == 0)
 			vm_fault_copy_entry(map, map, entry, entry, NULL);
 
 		/*
 		 * When restricting access, update the physical map.  Worry
 		 * about copy-on-write here.
 		 */
 		if ((old_prot & ~entry->protection) != 0) {
 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
 							VM_PROT_ALL)
 			pmap_protect(map->pmap, entry->start,
 			    entry->end,
 			    entry->protection & MASK(entry));
 #undef	MASK
 		}
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_madvise:
  *
  *	This routine traverses a processes map handling the madvise
  *	system call.  Advisories are classified as either those effecting
  *	the vm_map_entry structure, or those effecting the underlying
  *	objects.
  */
 int
 vm_map_madvise(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	int behav)
 {
 	vm_map_entry_t entry, prev_entry;
 	bool modify_map;
 
 	/*
 	 * Some madvise calls directly modify the vm_map_entry, in which case
 	 * we need to use an exclusive lock on the map and we need to perform
 	 * various clipping operations.  Otherwise we only need a read-lock
 	 * on the map.
 	 */
 	switch(behav) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
 	case MADV_NOSYNC:
 	case MADV_AUTOSYNC:
 	case MADV_NOCORE:
 	case MADV_CORE:
 		if (start == end)
 			return (0);
 		modify_map = true;
 		vm_map_lock(map);
 		break;
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
 		if (start == end)
 			return (0);
 		modify_map = false;
 		vm_map_lock_read(map);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/*
 	 * Locate starting entry and clip if necessary.
 	 */
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (modify_map) {
 		/*
 		 * madvise behaviors that are implemented in the vm_map_entry.
 		 *
 		 * We clip the vm_map_entry so that behavioral changes are
 		 * limited to the specified address range.
 		 */
 		for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
 		    entry->start < end;
 		    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 				continue;
 
 			vm_map_clip_end(map, entry, end);
 
 			switch (behav) {
 			case MADV_NORMAL:
 				vm_map_entry_set_behavior(entry,
 				    MAP_ENTRY_BEHAV_NORMAL);
 				break;
 			case MADV_SEQUENTIAL:
 				vm_map_entry_set_behavior(entry,
 				    MAP_ENTRY_BEHAV_SEQUENTIAL);
 				break;
 			case MADV_RANDOM:
 				vm_map_entry_set_behavior(entry,
 				    MAP_ENTRY_BEHAV_RANDOM);
 				break;
 			case MADV_NOSYNC:
 				entry->eflags |= MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_AUTOSYNC:
 				entry->eflags &= ~MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_NOCORE:
 				entry->eflags |= MAP_ENTRY_NOCOREDUMP;
 				break;
 			case MADV_CORE:
 				entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
 				break;
 			default:
 				break;
 			}
 			vm_map_try_merge_entries(map, prev_entry, entry);
 		}
 		vm_map_try_merge_entries(map, prev_entry, entry);
 		vm_map_unlock(map);
 	} else {
 		vm_pindex_t pstart, pend;
 
 		/*
 		 * madvise behaviors that are implemented in the underlying
 		 * vm_object.
 		 *
 		 * Since we don't clip the vm_map_entry, we have to clip
 		 * the vm_object pindex and count.
 		 */
 		if (!vm_map_lookup_entry(map, start, &entry))
 			entry = vm_map_entry_succ(entry);
 		for (; entry->start < end;
 		    entry = vm_map_entry_succ(entry)) {
 			vm_offset_t useEnd, useStart;
 
 			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 				continue;
 
 			/*
 			 * MADV_FREE would otherwise rewind time to
 			 * the creation of the shadow object.  Because
 			 * we hold the VM map read-locked, neither the
 			 * entry's object nor the presence of a
 			 * backing object can change.
 			 */
 			if (behav == MADV_FREE &&
 			    entry->object.vm_object != NULL &&
 			    entry->object.vm_object->backing_object != NULL)
 				continue;
 
 			pstart = OFF_TO_IDX(entry->offset);
 			pend = pstart + atop(entry->end - entry->start);
 			useStart = entry->start;
 			useEnd = entry->end;
 
 			if (entry->start < start) {
 				pstart += atop(start - entry->start);
 				useStart = start;
 			}
 			if (entry->end > end) {
 				pend -= atop(entry->end - end);
 				useEnd = end;
 			}
 
 			if (pstart >= pend)
 				continue;
 
 			/*
 			 * Perform the pmap_advise() before clearing
 			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
 			 * concurrent pmap operation, such as pmap_remove(),
 			 * could clear a reference in the pmap and set
 			 * PGA_REFERENCED on the page before the pmap_advise()
 			 * had completed.  Consequently, the page would appear
 			 * referenced based upon an old reference that
 			 * occurred before this pmap_advise() ran.
 			 */
 			if (behav == MADV_DONTNEED || behav == MADV_FREE)
 				pmap_advise(map->pmap, useStart, useEnd,
 				    behav);
 
 			vm_object_madvise(entry->object.vm_object, pstart,
 			    pend, behav);
 
 			/*
 			 * Pre-populate paging structures in the
 			 * WILLNEED case.  For wired entries, the
 			 * paging structures are already populated.
 			 */
 			if (behav == MADV_WILLNEED &&
 			    entry->wired_count == 0) {
 				vm_map_pmap_enter(map,
 				    useStart,
 				    entry->protection,
 				    entry->object.vm_object,
 				    pstart,
 				    ptoa(pend - pstart),
 				    MAP_PREFAULT_MADVISE
 				);
 			}
 		}
 		vm_map_unlock_read(map);
 	}
 	return (0);
 }
 
 
 /*
  *	vm_map_inherit:
  *
  *	Sets the inheritance of the specified address
  *	range in the target map.  Inheritance
  *	affects how the map will be shared with
  *	child maps at the time of vmspace_fork.
  */
 int
 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_inherit_t new_inheritance)
 {
 	vm_map_entry_t entry, prev_entry;
 
 	switch (new_inheritance) {
 	case VM_INHERIT_NONE:
 	case VM_INHERIT_COPY:
 	case VM_INHERIT_SHARE:
 	case VM_INHERIT_ZERO:
 		break;
 	default:
 		return (KERN_INVALID_ARGUMENT);
 	}
 	if (start == end)
 		return (KERN_SUCCESS);
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
 	    entry->start < end;
 	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 		vm_map_clip_end(map, entry, end);
 		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
 		    new_inheritance != VM_INHERIT_ZERO)
 			entry->inheritance = new_inheritance;
 		vm_map_try_merge_entries(map, prev_entry, entry);
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_entry_in_transition:
  *
  *	Release the map lock, and sleep until the entry is no longer in
  *	transition.  Awake and acquire the map lock.  If the map changed while
  *	another held the lock, lookup a possibly-changed entry at or after the
  *	'start' position of the old entry.
  */
 static vm_map_entry_t
 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
     vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
 {
 	vm_map_entry_t entry;
 	vm_offset_t start;
 	u_int last_timestamp;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 	    ("not in-tranition map entry %p", in_entry));
 	/*
 	 * We have not yet clipped the entry.
 	 */
 	start = MAX(in_start, in_entry->start);
 	in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 	last_timestamp = map->timestamp;
 	if (vm_map_unlock_and_wait(map, 0)) {
 		/*
 		 * Allow interruption of user wiring/unwiring?
 		 */
 	}
 	vm_map_lock(map);
 	if (last_timestamp + 1 == map->timestamp)
 		return (in_entry);
 
 	/*
 	 * Look again for the entry because the map was modified while it was
 	 * unlocked.  Specifically, the entry may have been clipped, merged, or
 	 * deleted.
 	 */
 	if (!vm_map_lookup_entry(map, start, &entry)) {
 		if (!holes_ok) {
 			*io_end = start;
 			return (NULL);
 		}
 		entry = vm_map_entry_succ(entry);
 	}
 	return (entry);
 }
 
 /*
  *	vm_map_unwire:
  *
  *	Implements both kernel and user unwiring.
  */
 int
 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	vm_map_entry_t entry, first_entry, next_entry, prev_entry;
 	int rv;
 	bool holes_ok, need_wakeup, user_unwire;
 
 	if (start == end)
 		return (KERN_SUCCESS);
 	holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
 	user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (holes_ok)
 			first_entry = vm_map_entry_succ(first_entry);
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	rv = KERN_SUCCESS;
 	for (entry = first_entry; entry->start < end; entry = next_entry) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			next_entry = vm_map_entry_in_transition(map, start,
 			    &end, holes_ok, entry);
 			if (next_entry == NULL) {
 				if (entry == first_entry) {
 					vm_map_unlock(map);
 					return (KERN_INVALID_ADDRESS);
 				}
 				rv = KERN_INVALID_ADDRESS;
 				break;
 			}
 			first_entry = (entry == first_entry) ?
 			    next_entry : NULL;
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
 		    entry->wiring_thread == NULL,
 		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = curthread;
 		next_entry = vm_map_entry_succ(entry);
 		/*
 		 * Check the map for holes in the specified region.
 		 * If holes_ok, skip this check.
 		 */
 		if (!holes_ok &&
 		    entry->end < end && next_entry->start > entry->end) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			break;
 		}
 		/*
 		 * If system unwiring, require that the entry is system wired.
 		 */
 		if (!user_unwire &&
 		    vm_map_entry_system_wired_count(entry) == 0) {
 			end = entry->end;
 			rv = KERN_INVALID_ARGUMENT;
 			break;
 		}
 	}
 	need_wakeup = false;
 	if (first_entry == NULL &&
 	    !vm_map_lookup_entry(map, start, &first_entry)) {
 		KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
 		prev_entry = first_entry;
 		entry = vm_map_entry_succ(first_entry);
 	} else {
 		prev_entry = vm_map_entry_pred(first_entry);
 		entry = first_entry;
 	}
 	for (; entry->start < end;
 	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 		/*
 		 * If holes_ok was specified, an empty
 		 * space in the unwired region could have been mapped
 		 * while the map lock was dropped for draining
 		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
 		 * could be simultaneously wiring this new mapping
 		 * entry.  Detect these cases and skip any entries
 		 * marked as in transition by us.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
 		    entry->wiring_thread != curthread) {
 			KASSERT(holes_ok,
 			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
 			continue;
 		}
 
 		if (rv == KERN_SUCCESS && (!user_unwire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
 			if (entry->wired_count == 1)
 				vm_map_entry_unwire(map, entry);
 			else
 				entry->wired_count--;
 			if (user_unwire)
 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 		}
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 		    ("vm_map_unwire: in-transition flag missing %p", entry));
 		KASSERT(entry->wiring_thread == curthread,
 		    ("vm_map_unwire: alien wire %p", entry));
 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = true;
 		}
 		vm_map_try_merge_entries(map, prev_entry, entry);
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 	vm_map_unlock(map);
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 static void
 vm_map_wire_user_count_sub(u_long npages)
 {
 
 	atomic_subtract_long(&vm_user_wire_count, npages);
 }
 
 static bool
 vm_map_wire_user_count_add(u_long npages)
 {
 	u_long wired;
 
 	wired = vm_user_wire_count;
 	do {
 		if (npages + wired > vm_page_max_user_wired)
 			return (false);
 	} while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
 	    npages + wired));
 
 	return (true);
 }
 
 /*
  *	vm_map_wire_entry_failure:
  *
  *	Handle a wiring failure on the given entry.
  *
  *	The map should be locked.
  */
 static void
 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
     vm_offset_t failed_addr)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
 	    entry->wired_count == 1,
 	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
 	KASSERT(failed_addr < entry->end,
 	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
 
 	/*
 	 * If any pages at the start of this entry were successfully wired,
 	 * then unwire them.
 	 */
 	if (failed_addr > entry->start) {
 		pmap_unwire(map->pmap, entry->start, failed_addr);
 		vm_object_unwire(entry->object.vm_object, entry->offset,
 		    failed_addr - entry->start, PQ_ACTIVE);
 	}
 
 	/*
 	 * Assign an out-of-range value to represent the failure to wire this
 	 * entry.
 	 */
 	entry->wired_count = -1;
 }
 
 int
 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
 {
 	int rv;
 
 	vm_map_lock(map);
 	rv = vm_map_wire_locked(map, start, end, flags);
 	vm_map_unlock(map);
 	return (rv);
 }
 
 
 /*
  *	vm_map_wire_locked:
  *
  *	Implements both kernel and user wiring.  Returns with the map locked,
  *	the map lock may be dropped.
  */
 int
 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
 {
 	vm_map_entry_t entry, first_entry, next_entry, prev_entry;
 	vm_offset_t faddr, saved_end, saved_start;
 	u_long npages;
 	u_int last_timestamp;
 	int rv;
 	bool holes_ok, need_wakeup, user_wire;
 	vm_prot_t prot;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	if (start == end)
 		return (KERN_SUCCESS);
 	prot = 0;
 	if (flags & VM_MAP_WIRE_WRITE)
 		prot |= VM_PROT_WRITE;
 	holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
 	user_wire = (flags & VM_MAP_WIRE_USER) != 0;
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (holes_ok)
 			first_entry = vm_map_entry_succ(first_entry);
 		else
 			return (KERN_INVALID_ADDRESS);
 	}
 	for (entry = first_entry; entry->start < end; entry = next_entry) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			next_entry = vm_map_entry_in_transition(map, start,
 			    &end, holes_ok, entry);
 			if (next_entry == NULL) {
 				if (entry == first_entry)
 					return (KERN_INVALID_ADDRESS);
 				rv = KERN_INVALID_ADDRESS;
 				goto done;
 			}
 			first_entry = (entry == first_entry) ?
 			    next_entry : NULL;
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
 		    entry->wiring_thread == NULL,
 		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = curthread;
 		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
 		    || (entry->protection & prot) != prot) {
 			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
 			if (!holes_ok) {
 				end = entry->end;
 				rv = KERN_INVALID_ADDRESS;
 				goto done;
 			}
 		} else if (entry->wired_count == 0) {
 			entry->wired_count++;
 
 			npages = atop(entry->end - entry->start);
 			if (user_wire && !vm_map_wire_user_count_add(npages)) {
 				vm_map_wire_entry_failure(map, entry,
 				    entry->start);
 				end = entry->end;
 				rv = KERN_RESOURCE_SHORTAGE;
 				goto done;
 			}
 
 			/*
 			 * Release the map lock, relying on the in-transition
 			 * mark.  Mark the map busy for fork.
 			 */
 			saved_start = entry->start;
 			saved_end = entry->end;
 			last_timestamp = map->timestamp;
 			vm_map_busy(map);
 			vm_map_unlock(map);
 
 			faddr = saved_start;
 			do {
 				/*
 				 * Simulate a fault to get the page and enter
 				 * it into the physical map.
 				 */
 				if ((rv = vm_fault(map, faddr,
 				    VM_PROT_NONE, VM_FAULT_WIRE, NULL)) !=
 				    KERN_SUCCESS)
 					break;
 			} while ((faddr += PAGE_SIZE) < saved_end);
 			vm_map_lock(map);
 			vm_map_unbusy(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.  The entry
 				 * may have been clipped, but NOT merged or
 				 * deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &next_entry))
 					KASSERT(false,
 					    ("vm_map_wire: lookup failed"));
 				first_entry = (entry == first_entry) ?
 				    next_entry : NULL;
 				for (entry = next_entry; entry->end < saved_end;
 				    entry = vm_map_entry_succ(entry)) {
 					/*
 					 * In case of failure, handle entries
 					 * that were not fully wired here;
 					 * fully wired entries are handled
 					 * later.
 					 */
 					if (rv != KERN_SUCCESS &&
 					    faddr < entry->end)
 						vm_map_wire_entry_failure(map,
 						    entry, faddr);
 				}
 			}
 			if (rv != KERN_SUCCESS) {
 				vm_map_wire_entry_failure(map, entry, faddr);
 				if (user_wire)
 					vm_map_wire_user_count_sub(npages);
 				end = entry->end;
 				goto done;
 			}
 		} else if (!user_wire ||
 			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			entry->wired_count++;
 		}
 		/*
 		 * Check the map for holes in the specified region.
 		 * If holes_ok was specified, skip this check.
 		 */
 		next_entry = vm_map_entry_succ(entry);
 		if (!holes_ok &&
 		    entry->end < end && next_entry->start > entry->end) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 	}
 	rv = KERN_SUCCESS;
 done:
 	need_wakeup = false;
 	if (first_entry == NULL &&
 	    !vm_map_lookup_entry(map, start, &first_entry)) {
 		KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
 		prev_entry = first_entry;
 		entry = vm_map_entry_succ(first_entry);
 	} else {
 		prev_entry = vm_map_entry_pred(first_entry);
 		entry = first_entry;
 	}
 	for (; entry->start < end;
 	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 		/*
 		 * If holes_ok was specified, an empty
 		 * space in the unwired region could have been mapped
 		 * while the map lock was dropped for faulting in the
 		 * pages or draining MAP_ENTRY_IN_TRANSITION.
 		 * Moreover, another thread could be simultaneously
 		 * wiring this new mapping entry.  Detect these cases
 		 * and skip any entries marked as in transition not by us.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
 		    entry->wiring_thread != curthread) {
 			KASSERT(holes_ok,
 			    ("vm_map_wire: !HOLESOK and new/changed entry"));
 			continue;
 		}
 
 		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
 			/* do nothing */
 		} else if (rv == KERN_SUCCESS) {
 			if (user_wire)
 				entry->eflags |= MAP_ENTRY_USER_WIRED;
 		} else if (entry->wired_count == -1) {
 			/*
 			 * Wiring failed on this entry.  Thus, unwiring is
 			 * unnecessary.
 			 */
 			entry->wired_count = 0;
 		} else if (!user_wire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			/*
 			 * Undo the wiring.  Wiring succeeded on this entry
 			 * but failed on a later entry.  
 			 */
 			if (entry->wired_count == 1) {
 				vm_map_entry_unwire(map, entry);
 				if (user_wire)
 					vm_map_wire_user_count_sub(
 					    atop(entry->end - entry->start));
 			} else
 				entry->wired_count--;
 		}
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 		    ("vm_map_wire: in-transition flag missing %p", entry));
 		KASSERT(entry->wiring_thread == curthread,
 		    ("vm_map_wire: alien wire %p", entry));
 		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
 		    MAP_ENTRY_WIRE_SKIPPED);
 		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = true;
 		}
 		vm_map_try_merge_entries(map, prev_entry, entry);
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  * vm_map_sync
  *
  * Push any dirty cached pages in the address range to their pager.
  * If syncio is TRUE, dirty pages are written synchronously.
  * If invalidate is TRUE, any cached pages are freed as well.
  *
  * If the size of the region from start to end is zero, we are
  * supposed to flush all modified pages within the region containing
  * start.  Unfortunately, a region can be split or coalesced with
  * neighboring regions, making it difficult to determine what the
  * original region was.  Therefore, we approximate this requirement by
  * flushing the current region containing start.
  *
  * Returns an error if any part of the specified range is not mapped.
  */
 int
 vm_map_sync(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	boolean_t syncio,
 	boolean_t invalidate)
 {
 	vm_map_entry_t entry, first_entry, next_entry;
 	vm_size_t size;
 	vm_object_t object;
 	vm_ooffset_t offset;
 	unsigned int last_timestamp;
 	boolean_t failed;
 
 	vm_map_lock_read(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	} else if (start == end) {
 		start = first_entry->start;
 		end = first_entry->end;
 	}
 	/*
 	 * Make a first pass to check for user-wired memory and holes.
 	 */
 	for (entry = first_entry; entry->start < end; entry = next_entry) {
 		if (invalidate &&
 		    (entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		next_entry = vm_map_entry_succ(entry);
 		if (end > entry->end &&
 		    entry->end != next_entry->start) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 
 	if (invalidate)
 		pmap_remove(map->pmap, start, end);
 	failed = FALSE;
 
 	/*
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
 	for (entry = first_entry; entry->start < end;) {
 		offset = entry->offset + (start - entry->start);
 		size = (end <= entry->end ? end : entry->end) - start;
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
 			vm_map_t smap;
 			vm_map_entry_t tentry;
 			vm_size_t tsize;
 
 			smap = entry->object.sub_map;
 			vm_map_lock_read(smap);
 			(void) vm_map_lookup_entry(smap, offset, &tentry);
 			tsize = tentry->end - offset;
 			if (tsize < size)
 				size = tsize;
 			object = tentry->object.vm_object;
 			offset = tentry->offset + (offset - tentry->start);
 			vm_map_unlock_read(smap);
 		} else {
 			object = entry->object.vm_object;
 		}
 		vm_object_reference(object);
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 		if (!vm_object_sync(object, offset, size, syncio, invalidate))
 			failed = TRUE;
 		start += size;
 		vm_object_deallocate(object);
 		vm_map_lock_read(map);
 		if (last_timestamp == map->timestamp ||
 		    !vm_map_lookup_entry(map, start, &entry))
 			entry = vm_map_entry_succ(entry);
 	}
 
 	vm_map_unlock_read(map);
 	return (failed ? KERN_FAILURE : KERN_SUCCESS);
 }
 
 /*
  *	vm_map_entry_unwire:	[ internal use only ]
  *
  *	Make the region specified by this entry pageable.
  *
  *	The map in question should be locked.
  *	[This is the reason for this routine's existence.]
  */
 static void
 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_size_t size;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->wired_count > 0,
 	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
 
 	size = entry->end - entry->start;
 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
 		vm_map_wire_user_count_sub(atop(size));
 	pmap_unwire(map->pmap, entry->start, entry->end);
 	vm_object_unwire(entry->object.vm_object, entry->offset, size,
 	    PQ_ACTIVE);
 	entry->wired_count = 0;
 }
 
 static void
 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
 {
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
 		vm_object_deallocate(entry->object.vm_object);
 	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_delete:	[ internal use only ]
  *
  *	Deallocate the given entry from the target map.
  */
 static void
 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t object;
-	vm_pindex_t offidxstart, offidxend, count, size1;
+	vm_pindex_t offidxstart, offidxend, size1;
 	vm_size_t size;
 
 	vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
 	object = entry->object.vm_object;
 
 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
 		MPASS(entry->cred == NULL);
 		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
 		MPASS(object == NULL);
 		vm_map_entry_deallocate(entry, map->system_map);
 		return;
 	}
 
 	size = entry->end - entry->start;
 	map->size -= size;
 
 	if (entry->cred != NULL) {
 		swap_release_by_cred(size, entry->cred);
 		crfree(entry->cred);
 	}
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
 		entry->object.vm_object = NULL;
 	} else if ((object->flags & OBJ_ANON) != 0 ||
 	    object == kernel_object) {
 		KASSERT(entry->cred == NULL || object->cred == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
-		count = atop(size);
 		offidxstart = OFF_TO_IDX(entry->offset);
-		offidxend = offidxstart + count;
+		offidxend = offidxstart + atop(size);
 		VM_OBJECT_WLOCK(object);
 		if (object->ref_count != 1 &&
 		    ((object->flags & OBJ_ONEMAPPING) != 0 ||
 		    object == kernel_object)) {
 			vm_object_collapse(object);
 
 			/*
 			 * The option OBJPR_NOTMAPPED can be passed here
 			 * because vm_map_delete() already performed
 			 * pmap_remove() on the only mapping to this range
 			 * of pages. 
 			 */
 			vm_object_page_remove(object, offidxstart, offidxend,
 			    OBJPR_NOTMAPPED);
-			if (object->type == OBJT_SWAP)
-				swap_pager_freespace(object, offidxstart,
-				    count);
 			if (offidxend >= object->size &&
 			    offidxstart < object->size) {
 				size1 = object->size;
 				object->size = offidxstart;
 				if (object->cred != NULL) {
 					size1 -= object->size;
 					KASSERT(object->charge >= ptoa(size1),
 					    ("object %p charge < 0", object));
 					swap_release_by_cred(ptoa(size1),
 					    object->cred);
 					object->charge -= ptoa(size1);
 				}
 			}
 		}
 		VM_OBJECT_WUNLOCK(object);
 	}
 	if (map->system_map)
 		vm_map_entry_deallocate(entry, TRUE);
 	else {
 		entry->defer_next = curthread->td_map_def_user;
 		curthread->td_map_def_user = entry;
 	}
 }
 
 /*
  *	vm_map_delete:	[ internal use only ]
  *
  *	Deallocates the given address range from the target
  *	map.
  */
 int
 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	vm_map_entry_t entry, next_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	if (start == end)
 		return (KERN_SUCCESS);
 
 	/*
 	 * Find the start of the region, and clip it.
 	 * Step through all entries in this region.
 	 */
 	for (entry = vm_map_lookup_clip_start(map, start, &entry);
 	    entry->start < end; entry = next_entry) {
 		/*
 		 * Wait for wiring or unwiring of an entry to complete.
 		 * Also wait for any system wirings to disappear on
 		 * user maps.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
 		    (vm_map_pmap(map) != kernel_pmap &&
 		    vm_map_entry_system_wired_count(entry) != 0)) {
 			unsigned int last_timestamp;
 			vm_offset_t saved_start;
 
 			saved_start = entry->start;
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			last_timestamp = map->timestamp;
 			(void) vm_map_unlock_and_wait(map, 0);
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				next_entry = vm_map_lookup_clip_start(map,
 				    saved_start, &next_entry);
 			} else
 				next_entry = entry;
 			continue;
 		}
 		vm_map_clip_end(map, entry, end);
 		next_entry = vm_map_entry_succ(entry);
 
 		/*
 		 * Unwire before removing addresses from the pmap; otherwise,
 		 * unwiring will put the entries back in the pmap.
 		 */
 		if (entry->wired_count != 0)
 			vm_map_entry_unwire(map, entry);
 
 		/*
 		 * Remove mappings for the pages, but only if the
 		 * mappings could exist.  For instance, it does not
 		 * make sense to call pmap_remove() for guard entries.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
 		    entry->object.vm_object != NULL)
 			pmap_remove(map->pmap, entry->start, entry->end);
 
 		if (entry->end == map->anon_loc)
 			map->anon_loc = entry->start;
 
 		/*
 		 * Delete the entry only after removing all pmap
 		 * entries pointing to its pages.  (Otherwise, its
 		 * page frames may be reallocated, and any modify bits
 		 * will be set in the wrong object!)
 		 */
 		vm_map_entry_delete(map, entry);
 	}
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_remove:
  *
  *	Remove the given address range from the target map.
  *	This is the exported form of vm_map_delete.
  */
 int
 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	int result;
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	result = vm_map_delete(map, start, end);
 	vm_map_unlock(map);
 	return (result);
 }
 
 /*
  *	vm_map_check_protection:
  *
  *	Assert that the target map allows the specified privilege on the
  *	entire address region given.  The entire region must be allocated.
  *
  *	WARNING!  This code does not and should not check whether the
  *	contents of the region is accessible.  For example a smaller file
  *	might be mapped into a larger address space.
  *
  *	NOTE!  This code is also called by munmap().
  *
  *	The map must be locked.  A read lock is sufficient.
  */
 boolean_t
 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
 			vm_prot_t protection)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t tmp_entry;
 
 	if (!vm_map_lookup_entry(map, start, &tmp_entry))
 		return (FALSE);
 	entry = tmp_entry;
 
 	while (start < end) {
 		/*
 		 * No holes allowed!
 		 */
 		if (start < entry->start)
 			return (FALSE);
 		/*
 		 * Check protection associated with entry.
 		 */
 		if ((entry->protection & protection) != protection)
 			return (FALSE);
 		/* go to next entry */
 		start = entry->end;
 		entry = vm_map_entry_succ(entry);
 	}
 	return (TRUE);
 }
 
 
 /*
  *
  *	vm_map_copy_swap_object:
  *
  *	Copies a swap-backed object from an existing map entry to a
  *	new one.  Carries forward the swap charge.  May change the
  *	src object on return.
  */
 static void
 vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
     vm_offset_t size, vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
 	struct ucred *cred;
 	int charged;
 
 	src_object = src_entry->object.vm_object;
 	charged = ENTRY_CHARGED(src_entry);
 	if ((src_object->flags & OBJ_ANON) != 0) {
 		VM_OBJECT_WLOCK(src_object);
 		vm_object_collapse(src_object);
 		if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
 			vm_object_split(src_entry);
 			src_object = src_entry->object.vm_object;
 		}
 		vm_object_reference_locked(src_object);
 		vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
 		VM_OBJECT_WUNLOCK(src_object);
 	} else
 		vm_object_reference(src_object);
 	if (src_entry->cred != NULL &&
 	    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 		KASSERT(src_object->cred == NULL,
 		    ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
 		     src_object));
 		src_object->cred = src_entry->cred;
 		src_object->charge = size;
 	}
 	dst_entry->object.vm_object = src_object;
 	if (charged) {
 		cred = curthread->td_ucred;
 		crhold(cred);
 		dst_entry->cred = cred;
 		*fork_charge += size;
 		if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			crhold(cred);
 			src_entry->cred = cred;
 			*fork_charge += size;
 		}
 	}
 }
 
 /*
  *	vm_map_copy_entry:
  *
  *	Copies the contents of the source entry to the destination
  *	entry.  The entries *must* be aligned properly.
  */
 static void
 vm_map_copy_entry(
 	vm_map_t src_map,
 	vm_map_t dst_map,
 	vm_map_entry_t src_entry,
 	vm_map_entry_t dst_entry,
 	vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
 	vm_map_entry_t fake_entry;
 	vm_offset_t size;
 
 	VM_MAP_ASSERT_LOCKED(dst_map);
 
 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
 		return;
 
 	if (src_entry->wired_count == 0 ||
 	    (src_entry->protection & VM_PROT_WRITE) == 0) {
 		/*
 		 * If the source entry is marked needs_copy, it is already
 		 * write-protected.
 		 */
 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
 		    (src_entry->protection & VM_PROT_WRITE) != 0) {
 			pmap_protect(src_map->pmap,
 			    src_entry->start,
 			    src_entry->end,
 			    src_entry->protection & ~VM_PROT_WRITE);
 		}
 
 		/*
 		 * Make a copy of the object.
 		 */
 		size = src_entry->end - src_entry->start;
 		if ((src_object = src_entry->object.vm_object) != NULL) {
 			if (src_object->type == OBJT_DEFAULT ||
 			    src_object->type == OBJT_SWAP) {
 				vm_map_copy_swap_object(src_entry, dst_entry,
 				    size, fork_charge);
 				/* May have split/collapsed, reload obj. */
 				src_object = src_entry->object.vm_object;
 			} else {
 				vm_object_reference(src_object);
 				dst_entry->object.vm_object = src_object;
 			}
 			src_entry->eflags |= MAP_ENTRY_COW |
 			    MAP_ENTRY_NEEDS_COPY;
 			dst_entry->eflags |= MAP_ENTRY_COW |
 			    MAP_ENTRY_NEEDS_COPY;
 			dst_entry->offset = src_entry->offset;
 			if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
 				/*
 				 * MAP_ENTRY_WRITECNT cannot
 				 * indicate write reference from
 				 * src_entry, since the entry is
 				 * marked as needs copy.  Allocate a
 				 * fake entry that is used to
 				 * decrement object->un_pager writecount
 				 * at the appropriate time.  Attach
 				 * fake_entry to the deferred list.
 				 */
 				fake_entry = vm_map_entry_create(dst_map);
 				fake_entry->eflags = MAP_ENTRY_WRITECNT;
 				src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
 				vm_object_reference(src_object);
 				fake_entry->object.vm_object = src_object;
 				fake_entry->start = src_entry->start;
 				fake_entry->end = src_entry->end;
 				fake_entry->defer_next =
 				    curthread->td_map_def_user;
 				curthread->td_map_def_user = fake_entry;
 			}
 
 			pmap_copy(dst_map->pmap, src_map->pmap,
 			    dst_entry->start, dst_entry->end - dst_entry->start,
 			    src_entry->start);
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
 			if (src_entry->cred != NULL) {
 				dst_entry->cred = curthread->td_ucred;
 				crhold(dst_entry->cred);
 				*fork_charge += size;
 			}
 		}
 	} else {
 		/*
 		 * We don't want to make writeable wired pages copy-on-write.
 		 * Immediately copy these pages into the new map by simulating
 		 * page faults.  The new pages are pageable.
 		 */
 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
 		    fork_charge);
 	}
 }
 
 /*
  * vmspace_map_entry_forked:
  * Update the newly-forked vmspace each time a map entry is inherited
  * or copied.  The values for vm_dsize and vm_tsize are approximate
  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
  */
 static void
 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
     vm_map_entry_t entry)
 {
 	vm_size_t entrysize;
 	vm_offset_t newend;
 
 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
 		return;
 	entrysize = entry->end - entry->start;
 	vm2->vm_map.size += entrysize;
 	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
 		vm2->vm_ssize += btoc(entrysize);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
 	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
 		vm2->vm_dsize += btoc(newend - entry->start);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
 	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
 		vm2->vm_tsize += btoc(newend - entry->start);
 	}
 }
 
 /*
  * vmspace_fork:
  * Create a new process vmspace structure and vm_map
  * based on those of an existing process.  The new map
  * is based on the old map, according to the inheritance
  * values on the regions in that map.
  *
  * XXX It might be worth coalescing the entries added to the new vmspace.
  *
  * The source map must not be locked.
  */
 struct vmspace *
 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 {
 	struct vmspace *vm2;
 	vm_map_t new_map, old_map;
 	vm_map_entry_t new_entry, old_entry;
 	vm_object_t object;
 	int error, locked;
 	vm_inherit_t inh;
 
 	old_map = &vm1->vm_map;
 	/* Copy immutable fields of vm1 to vm2. */
 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
 	    pmap_pinit);
 	if (vm2 == NULL)
 		return (NULL);
 
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
 	vm_map_lock(old_map);
 	if (old_map->busy)
 		vm_map_wait_busy(old_map);
 	new_map = &vm2->vm_map;
 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
 	KASSERT(locked, ("vmspace_fork: lock failed"));
 
 	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
 	if (error != 0) {
 		sx_xunlock(&old_map->lock);
 		sx_xunlock(&new_map->lock);
 		vm_map_process_deferred();
 		vmspace_free(vm2);
 		return (NULL);
 	}
 
 	new_map->anon_loc = old_map->anon_loc;
 
 	VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
 		if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			panic("vm_map_fork: encountered a submap");
 
 		inh = old_entry->inheritance;
 		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
 		    inh != VM_INHERIT_NONE)
 			inh = VM_INHERIT_COPY;
 
 		switch (inh) {
 		case VM_INHERIT_NONE:
 			break;
 
 		case VM_INHERIT_SHARE:
 			/*
 			 * Clone the entry, creating the shared object if
 			 * necessary.
 			 */
 			object = old_entry->object.vm_object;
 			if (object == NULL) {
 				vm_map_entry_back(old_entry);
 				object = old_entry->object.vm_object;
 			}
 
 			/*
 			 * Add the reference before calling vm_object_shadow
 			 * to insure that a shadow object is created.
 			 */
 			vm_object_reference(object);
 			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 				vm_object_shadow(&old_entry->object.vm_object,
 				    &old_entry->offset,
 				    old_entry->end - old_entry->start,
 				    old_entry->cred,
 				    /* Transfer the second reference too. */
 				    true);
 				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 				old_entry->cred = NULL;
 
 				/*
 				 * As in vm_map_merged_neighbor_dispose(),
 				 * the vnode lock will not be acquired in
 				 * this call to vm_object_deallocate().
 				 */
 				vm_object_deallocate(object);
 				object = old_entry->object.vm_object;
 			} else {
 				VM_OBJECT_WLOCK(object);
 				vm_object_clear_flag(object, OBJ_ONEMAPPING);
 				if (old_entry->cred != NULL) {
 					KASSERT(object->cred == NULL,
 					    ("vmspace_fork both cred"));
 					object->cred = old_entry->cred;
 					object->charge = old_entry->end -
 					    old_entry->start;
 					old_entry->cred = NULL;
 				}
 
 				/*
 				 * Assert the correct state of the vnode
 				 * v_writecount while the object is locked, to
 				 * not relock it later for the assertion
 				 * correctness.
 				 */
 				if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
 				    object->type == OBJT_VNODE) {
 					KASSERT(((struct vnode *)object->
 					    handle)->v_writecount > 0,
 					    ("vmspace_fork: v_writecount %p",
 					    object));
 					KASSERT(object->un_pager.vnp.
 					    writemappings > 0,
 					    ("vmspace_fork: vnp.writecount %p",
 					    object));
 				}
 				VM_OBJECT_WUNLOCK(object);
 			}
 
 			/*
 			 * Clone the entry, referencing the shared object.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
 				vm_pager_update_writecount(object,
 				    new_entry->start, new_entry->end);
 			}
 			vm_map_entry_set_vnode_text(new_entry, true);
 
 			/*
 			 * Insert the entry into the new map -- we know we're
 			 * inserting at the end of the new map.
 			 */
 			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			/*
 			 * Update the physical map
 			 */
 			pmap_copy(new_map->pmap, old_map->pmap,
 			    new_entry->start,
 			    (old_entry->end - old_entry->start),
 			    old_entry->start);
 			break;
 
 		case VM_INHERIT_COPY:
 			/*
 			 * Clone the entry and link into the map.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			/*
 			 * Copied entry is COW over the old object.
 			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
 			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
 			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 			vm_map_copy_entry(old_map, new_map, old_entry,
 			    new_entry, fork_charge);
 			vm_map_entry_set_vnode_text(new_entry, true);
 			break;
 
 		case VM_INHERIT_ZERO:
 			/*
 			 * Create a new anonymous mapping entry modelled from
 			 * the old one.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			memset(new_entry, 0, sizeof(*new_entry));
 
 			new_entry->start = old_entry->start;
 			new_entry->end = old_entry->end;
 			new_entry->eflags = old_entry->eflags &
 			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
 			    MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC);
 			new_entry->protection = old_entry->protection;
 			new_entry->max_protection = old_entry->max_protection;
 			new_entry->inheritance = VM_INHERIT_ZERO;
 
 			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			new_entry->cred = curthread->td_ucred;
 			crhold(new_entry->cred);
 			*fork_charge += (new_entry->end - new_entry->start);
 
 			break;
 		}
 	}
 	/*
 	 * Use inlined vm_map_unlock() to postpone handling the deferred
 	 * map entries, which cannot be done until both old_map and
 	 * new_map locks are released.
 	 */
 	sx_xunlock(&old_map->lock);
 	sx_xunlock(&new_map->lock);
 	vm_map_process_deferred();
 
 	return (vm2);
 }
 
 /*
  * Create a process's stack for exec_new_vmspace().  This function is never
  * asked to wire the newly created stack.
  */
 int
 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_size_t growsize, init_ssize;
 	rlim_t vmemlim;
 	int rv;
 
 	MPASS((map->flags & MAP_WIREFUTURE) == 0);
 	growsize = sgrowsiz;
 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
 	vm_map_lock(map);
 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + init_ssize > vmemlim) {
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
 	    max, cow);
 out:
 	vm_map_unlock(map);
 	return (rv);
 }
 
 static int stack_guard_page = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
     &stack_guard_page, 0,
     "Specifies the number of guard pages for a stack that grows");
 
 static int
 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, prev_entry;
 	vm_offset_t bot, gap_bot, gap_top, top;
 	vm_size_t init_ssize, sgp;
 	int orient, rv;
 
 	/*
 	 * The stack orientation is piggybacked with the cow argument.
 	 * Extract it into orient and mask the cow argument so that we
 	 * don't pass it around further.
 	 */
 	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
 	KASSERT(orient != 0, ("No stack grow direction"));
 	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
 	    ("bi-dir stack"));
 
 	if (max_ssize == 0 ||
 	    !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
 		return (KERN_INVALID_ADDRESS);
 	sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
 	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
 	    (vm_size_t)stack_guard_page * PAGE_SIZE;
 	if (sgp >= max_ssize)
 		return (KERN_INVALID_ARGUMENT);
 
 	init_ssize = growsize;
 	if (max_ssize < init_ssize + sgp)
 		init_ssize = max_ssize - sgp;
 
 	/* If addr is already mapped, no go */
 	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
 		return (KERN_NO_SPACE);
 
 	/*
 	 * If we can't accommodate max_ssize in the current mapping, no go.
 	 */
 	if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
 		return (KERN_NO_SPACE);
 
 	/*
 	 * We initially map a stack of only init_ssize.  We will grow as
 	 * needed later.  Depending on the orientation of the stack (i.e.
 	 * the grow direction) we either map at the top of the range, the
 	 * bottom of the range or in the middle.
 	 *
 	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
 	 * and cow to be 0.  Possibly we should eliminate these as input
 	 * parameters, and just pass these values here in the insert call.
 	 */
 	if (orient == MAP_STACK_GROWS_DOWN) {
 		bot = addrbos + max_ssize - init_ssize;
 		top = bot + init_ssize;
 		gap_bot = addrbos;
 		gap_top = bot;
 	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
 		bot = addrbos;
 		top = bot + init_ssize;
 		gap_bot = top;
 		gap_top = addrbos + max_ssize;
 	}
 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
 	if (rv != KERN_SUCCESS)
 		return (rv);
 	new_entry = vm_map_entry_succ(prev_entry);
 	KASSERT(new_entry->end == top || new_entry->start == bot,
 	    ("Bad entry start/end for new stack entry"));
 	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
 	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
 	if (gap_bot == gap_top)
 		return (KERN_SUCCESS);
 	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
 	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
 	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
 	if (rv == KERN_SUCCESS) {
 		/*
 		 * Gap can never successfully handle a fault, so
 		 * read-ahead logic is never used for it.  Re-use
 		 * next_read of the gap entry to store
 		 * stack_guard_page for vm_map_growstack().
 		 */
 		if (orient == MAP_STACK_GROWS_DOWN)
 			vm_map_entry_pred(new_entry)->next_read = sgp;
 		else
 			vm_map_entry_succ(new_entry)->next_read = sgp;
 	} else {
 		(void)vm_map_delete(map, bot, top);
 	}
 	return (rv);
 }
 
 /*
  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
  * successfully grow the stack.
  */
 static int
 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
 {
 	vm_map_entry_t stack_entry;
 	struct proc *p;
 	struct vmspace *vm;
 	struct ucred *cred;
 	vm_offset_t gap_end, gap_start, grow_start;
 	vm_size_t grow_amount, guard, max_grow;
 	rlim_t lmemlim, stacklim, vmemlim;
 	int rv, rv1;
 	bool gap_deleted, grow_down, is_procstack;
 #ifdef notyet
 	uint64_t limit;
 #endif
 #ifdef RACCT
 	int error;
 #endif
 
 	p = curproc;
 	vm = p->p_vmspace;
 
 	/*
 	 * Disallow stack growth when the access is performed by a
 	 * debugger or AIO daemon.  The reason is that the wrong
 	 * resource limits are applied.
 	 */
 	if (p != initproc && (map != &p->p_vmspace->vm_map ||
 	    p->p_textvp == NULL))
 		return (KERN_FAILURE);
 
 	MPASS(!map->system_map);
 
 	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
 	stacklim = lim_cur(curthread, RLIMIT_STACK);
 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 retry:
 	/* If addr is not in a hole for a stack grow area, no need to grow. */
 	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
 		return (KERN_FAILURE);
 	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
 		return (KERN_SUCCESS);
 	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
 		stack_entry = vm_map_entry_succ(gap_entry);
 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
 		    stack_entry->start != gap_entry->end)
 			return (KERN_FAILURE);
 		grow_amount = round_page(stack_entry->start - addr);
 		grow_down = true;
 	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
 		stack_entry = vm_map_entry_pred(gap_entry);
 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
 		    stack_entry->end != gap_entry->start)
 			return (KERN_FAILURE);
 		grow_amount = round_page(addr + 1 - stack_entry->end);
 		grow_down = false;
 	} else {
 		return (KERN_FAILURE);
 	}
 	guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
 	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
 	    gap_entry->next_read;
 	max_grow = gap_entry->end - gap_entry->start;
 	if (guard > max_grow)
 		return (KERN_NO_SPACE);
 	max_grow -= guard;
 	if (grow_amount > max_grow)
 		return (KERN_NO_SPACE);
 
 	/*
 	 * If this is the main process stack, see if we're over the stack
 	 * limit.
 	 */
 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
 	    addr < (vm_offset_t)p->p_sysent->sv_usrstack;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
 		return (KERN_NO_SPACE);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		if (is_procstack && racct_set(p, RACCT_STACK,
 		    ctob(vm->vm_ssize) + grow_amount)) {
 			PROC_UNLOCK(p);
 			return (KERN_NO_SPACE);
 		}
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	grow_amount = roundup(grow_amount, sgrowsiz);
 	if (grow_amount > max_grow)
 		grow_amount = max_grow;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		grow_amount = trunc_page((vm_size_t)stacklim) -
 		    ctob(vm->vm_ssize);
 	}
 
 #ifdef notyet
 	PROC_LOCK(p);
 	limit = racct_get_available(p, RACCT_STACK);
 	PROC_UNLOCK(p);
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
 		grow_amount = limit - ctob(vm->vm_ssize);
 #endif
 
 	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
 		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(p);
 			if (racct_set(p, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
 				PROC_UNLOCK(p);
 				rv = KERN_NO_SPACE;
 				goto out;
 			}
 			PROC_UNLOCK(p);
 		}
 #endif
 	}
 
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + grow_amount > vmemlim) {
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
 			PROC_UNLOCK(p);
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	if (vm_map_lock_upgrade(map)) {
 		gap_entry = NULL;
 		vm_map_lock_read(map);
 		goto retry;
 	}
 
 	if (grow_down) {
 		grow_start = gap_entry->end - grow_amount;
 		if (gap_entry->start + grow_amount == gap_entry->end) {
 			gap_start = gap_entry->start;
 			gap_end = gap_entry->end;
 			vm_map_entry_delete(map, gap_entry);
 			gap_deleted = true;
 		} else {
 			MPASS(gap_entry->start < gap_entry->end - grow_amount);
 			vm_map_entry_resize(map, gap_entry, -grow_amount);
 			gap_deleted = false;
 		}
 		rv = vm_map_insert(map, NULL, 0, grow_start,
 		    grow_start + grow_amount,
 		    stack_entry->protection, stack_entry->max_protection,
 		    MAP_STACK_GROWS_DOWN);
 		if (rv != KERN_SUCCESS) {
 			if (gap_deleted) {
 				rv1 = vm_map_insert(map, NULL, 0, gap_start,
 				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
 				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
 				MPASS(rv1 == KERN_SUCCESS);
 			} else
 				vm_map_entry_resize(map, gap_entry,
 				    grow_amount);
 		}
 	} else {
 		grow_start = stack_entry->end;
 		cred = stack_entry->cred;
 		if (cred == NULL && stack_entry->object.vm_object != NULL)
 			cred = stack_entry->object.vm_object->cred;
 		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
 			rv = KERN_NO_SPACE;
 		/* Grow the underlying object if applicable. */
 		else if (stack_entry->object.vm_object == NULL ||
 		    vm_object_coalesce(stack_entry->object.vm_object,
 		    stack_entry->offset,
 		    (vm_size_t)(stack_entry->end - stack_entry->start),
 		    grow_amount, cred != NULL)) {
 			if (gap_entry->start + grow_amount == gap_entry->end) {
 				vm_map_entry_delete(map, gap_entry);
 				vm_map_entry_resize(map, stack_entry,
 				    grow_amount);
 			} else {
 				gap_entry->start += grow_amount;
 				stack_entry->end += grow_amount;
 			}
 			map->size += grow_amount;
 			rv = KERN_SUCCESS;
 		} else
 			rv = KERN_FAILURE;
 	}
 	if (rv == KERN_SUCCESS && is_procstack)
 		vm->vm_ssize += btoc(grow_amount);
 
 	/*
 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
 	 */
 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
 		rv = vm_map_wire_locked(map, grow_start,
 		    grow_start + grow_amount,
 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 	}
 	vm_map_lock_downgrade(map);
 
 out:
 #ifdef RACCT
 	if (racct_enable && rv != KERN_SUCCESS) {
 		PROC_LOCK(p);
 		error = racct_set(p, RACCT_VMEM, map->size);
 		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
 		if (!old_mlock) {
 			error = racct_set(p, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)));
 			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
 		}
 	    	error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
 		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	return (rv);
 }
 
 /*
  * Unshare the specified VM space for exec.  If other processes are
  * mapped to it, then create a new one.  The new vmspace is null.
  */
 int
 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
 	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("vmspace_exec recursed"));
 	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
 	/*
 	 * This code is written like this for prototype purposes.  The
 	 * goal is to avoid running down the vmspace here, but let the
 	 * other process's that are still using the vmspace to finally
 	 * run it down.  Even though there is little or no chance of blocking
 	 * here, it is a good idea to keep this form for future mods.
 	 */
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)
 		pmap_activate(curthread);
 	curthread->td_pflags |= TDP_EXECVMSPC;
 	return (0);
 }
 
 /*
  * Unshare the specified VM space for forcing COW.  This
  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
  */
 int
 vmspace_unshare(struct proc *p)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 	vm_ooffset_t fork_charge;
 
 	if (oldvmspace->vm_refcnt == 1)
 		return (0);
 	fork_charge = 0;
 	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
 		vmspace_free(newvmspace);
 		return (ENOMEM);
 	}
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)
 		pmap_activate(curthread);
 	vmspace_free(oldvmspace);
 	return (0);
 }
 
 /*
  *	vm_map_lookup:
  *
  *	Finds the VM object, offset, and
  *	protection for a given virtual address in the
  *	specified map, assuming a page fault of the
  *	type specified.
  *
  *	Leaves the map in question locked for read; return
  *	values are guaranteed until a vm_map_lookup_done
  *	call is performed.  Note that the map argument
  *	is in/out; the returned map must be used in
  *	the call to vm_map_lookup_done.
  *
  *	A handle (out_entry) is returned for use in
  *	vm_map_lookup_done, to make that fast.
  *
  *	If a lookup is requested with "write protection"
  *	specified, the map may be changed to perform virtual
  *	copying operations, although the data referenced will
  *	remain the same.
  */
 int
 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
 	      vm_offset_t vaddr,
 	      vm_prot_t fault_typea,
 	      vm_map_entry_t *out_entry,	/* OUT */
 	      vm_object_t *object,		/* OUT */
 	      vm_pindex_t *pindex,		/* OUT */
 	      vm_prot_t *out_prot,		/* OUT */
 	      boolean_t *wired)			/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type;
 	vm_object_t eobject;
 	vm_size_t size;
 	struct ucred *cred;
 
 RetryLookup:
 
 	vm_map_lock_read(map);
 
 RetryLookupLocked:
 	/*
 	 * Lookup the faulting address.
 	 */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	}
 
 	entry = *out_entry;
 
 	/*
 	 * Handle submaps.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		vm_map_t old_map = map;
 
 		*var_map = map = entry->object.sub_map;
 		vm_map_unlock_read(old_map);
 		goto RetryLookup;
 	}
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 */
 	prot = entry->protection;
 	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
 		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
 		if (prot == VM_PROT_NONE && map != kernel_map &&
 		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
 		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
 		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
 		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
 			goto RetryLookupLocked;
 	}
 	fault_type = fault_typea & VM_PROT_ALL;
 	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
 	    ("entry %p flags %x", entry, entry->eflags));
 	if ((fault_typea & VM_PROT_COPY) != 0 &&
 	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
 	    (entry->eflags & MAP_ENTRY_COW) == 0) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		fault_type = entry->protection;
 	size = entry->end - entry->start;
 
 	/*
 	 * If the entry was copy-on-write, we either ...
 	 */
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * If we want to write the page, we may as well handle that
 		 * now since we've got the map locked.
 		 *
 		 * If we don't need to write the page, we just demote the
 		 * permissions allowed.
 		 */
 		if ((fault_type & VM_PROT_WRITE) != 0 ||
 		    (fault_typea & VM_PROT_COPY) != 0) {
 			/*
 			 * Make a new object, and place it in the object
 			 * chain.  Note that no new references have appeared
 			 * -- one just moved from the map to the new
 			 * object.
 			 */
 			if (vm_map_lock_upgrade(map))
 				goto RetryLookup;
 
 			if (entry->cred == NULL) {
 				/*
 				 * The debugger owner is charged for
 				 * the memory.
 				 */
 				cred = curthread->td_ucred;
 				crhold(cred);
 				if (!swap_reserve_by_cred(size, cred)) {
 					crfree(cred);
 					vm_map_unlock(map);
 					return (KERN_RESOURCE_SHORTAGE);
 				}
 				entry->cred = cred;
 			}
 			eobject = entry->object.vm_object;
 			vm_object_shadow(&entry->object.vm_object,
 			    &entry->offset, size, entry->cred, false);
 			if (eobject == entry->object.vm_object) {
 				/*
 				 * The object was not shadowed.
 				 */
 				swap_release_by_cred(size, entry->cred);
 				crfree(entry->cred);
 			}
 			entry->cred = NULL;
 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 
 			vm_map_lock_downgrade(map);
 		} else {
 			/*
 			 * We're attempting to read a copy-on-write page --
 			 * don't allow writes.
 			 */
 			prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * Create an object if necessary.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map) {
 		if (vm_map_lock_upgrade(map))
 			goto RetryLookup;
 		entry->object.vm_object = vm_object_allocate_anon(atop(size),
 		    NULL, entry->cred, entry->cred != NULL ? size : 0);
 		entry->offset = 0;
 		entry->cred = NULL;
 		vm_map_lock_downgrade(map);
 	}
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_locked:
  *
  *	Lookup the faulting address.  A version of vm_map_lookup that returns 
  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
  */
 int
 vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
 		     vm_offset_t vaddr,
 		     vm_prot_t fault_typea,
 		     vm_map_entry_t *out_entry,	/* OUT */
 		     vm_object_t *object,	/* OUT */
 		     vm_pindex_t *pindex,	/* OUT */
 		     vm_prot_t *out_prot,	/* OUT */
 		     boolean_t *wired)		/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
 
 	/*
 	 * Lookup the faulting address.
 	 */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry))
 		return (KERN_INVALID_ADDRESS);
 
 	entry = *out_entry;
 
 	/*
 	 * Fail if the entry refers to a submap.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 		return (KERN_FAILURE);
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 */
 	prot = entry->protection;
 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 	if ((fault_type & prot) != fault_type)
 		return (KERN_PROTECTION_FAILURE);
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		fault_type = entry->protection;
 
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * Fail if the entry was copy-on-write for a write fault.
 		 */
 		if (fault_type & VM_PROT_WRITE)
 			return (KERN_FAILURE);
 		/*
 		 * We're attempting to read a copy-on-write page --
 		 * don't allow writes.
 		 */
 		prot &= ~VM_PROT_WRITE;
 	}
 
 	/*
 	 * Fail if an object should be created.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map)
 		return (KERN_FAILURE);
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_done:
  *
  *	Releases locks acquired by a vm_map_lookup
  *	(according to the handle returned by that lookup).
  */
 void
 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
 {
 	/*
 	 * Unlock the main-level map
 	 */
 	vm_map_unlock_read(map);
 }
 
 vm_offset_t
 vm_map_max_KBI(const struct vm_map *map)
 {
 
 	return (vm_map_max(map));
 }
 
 vm_offset_t
 vm_map_min_KBI(const struct vm_map *map)
 {
 
 	return (vm_map_min(map));
 }
 
 pmap_t
 vm_map_pmap_KBI(vm_map_t map)
 {
 
 	return (map->pmap);
 }
 
 #ifdef INVARIANTS
 static void
 _vm_map_assert_consistent(vm_map_t map, int check)
 {
 	vm_map_entry_t entry, prev;
 	vm_map_entry_t cur, header, lbound, ubound;
 	vm_size_t max_left, max_right;
 
 #ifdef DIAGNOSTIC
 	++map->nupdates;
 #endif
 	if (enable_vmmap_check != check)
 		return;
 
 	header = prev = &map->header;
 	VM_MAP_ENTRY_FOREACH(entry, map) {
 		KASSERT(prev->end <= entry->start,
 		    ("map %p prev->end = %jx, start = %jx", map,
 		    (uintmax_t)prev->end, (uintmax_t)entry->start));
 		KASSERT(entry->start < entry->end,
 		    ("map %p start = %jx, end = %jx", map,
 		    (uintmax_t)entry->start, (uintmax_t)entry->end));
 		KASSERT(entry->left == header ||
 		    entry->left->start < entry->start,
 		    ("map %p left->start = %jx, start = %jx", map,
 		    (uintmax_t)entry->left->start, (uintmax_t)entry->start));
 		KASSERT(entry->right == header ||
 		    entry->start < entry->right->start,
 		    ("map %p start = %jx, right->start = %jx", map,
 		    (uintmax_t)entry->start, (uintmax_t)entry->right->start));
 		cur = map->root;
 		lbound = ubound = header;
 		for (;;) {
 			if (entry->start < cur->start) {
 				ubound = cur;
 				cur = cur->left;
 				KASSERT(cur != lbound,
 				    ("map %p cannot find %jx",
 				    map, (uintmax_t)entry->start));
 			} else if (cur->end <= entry->start) {
 				lbound = cur;
 				cur = cur->right;
 				KASSERT(cur != ubound,
 				    ("map %p cannot find %jx",
 				    map, (uintmax_t)entry->start));
 			} else {
 				KASSERT(cur == entry,
 				    ("map %p cannot find %jx",
 				    map, (uintmax_t)entry->start));
 				break;
 			}
 		}
 		max_left = vm_map_entry_max_free_left(entry, lbound);
 		max_right = vm_map_entry_max_free_right(entry, ubound);
 		KASSERT(entry->max_free == vm_size_max(max_left, max_right),
 		    ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
 		    (uintmax_t)entry->max_free,
 		    (uintmax_t)max_left, (uintmax_t)max_right));
 		prev = entry;
 	}
 	KASSERT(prev->end <= entry->start,
 	    ("map %p prev->end = %jx, start = %jx", map,
 	    (uintmax_t)prev->end, (uintmax_t)entry->start));
 }
 #endif
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 static void
 vm_map_print(vm_map_t map)
 {
 	vm_map_entry_t entry, prev;
 
 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
 	    (void *)map,
 	    (void *)map->pmap, map->nentries, map->timestamp);
 
 	db_indent += 2;
 	prev = &map->header;
 	VM_MAP_ENTRY_FOREACH(entry, map) {
 		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
 		    (void *)entry, (void *)entry->start, (void *)entry->end,
 		    entry->eflags);
 		{
 			static const char * const inheritance_name[4] =
 			{"share", "copy", "none", "donate_copy"};
 
 			db_iprintf(" prot=%x/%x/%s",
 			    entry->protection,
 			    entry->max_protection,
 			    inheritance_name[(int)(unsigned char)
 			    entry->inheritance]);
 			if (entry->wired_count != 0)
 				db_printf(", wired");
 		}
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			db_printf(", share=%p, offset=0x%jx\n",
 			    (void *)entry->object.sub_map,
 			    (uintmax_t)entry->offset);
 			if (prev == &map->header ||
 			    prev->object.sub_map !=
 				entry->object.sub_map) {
 				db_indent += 2;
 				vm_map_print((vm_map_t)entry->object.sub_map);
 				db_indent -= 2;
 			}
 		} else {
 			if (entry->cred != NULL)
 				db_printf(", ruid %d", entry->cred->cr_ruid);
 			db_printf(", object=%p, offset=0x%jx",
 			    (void *)entry->object.vm_object,
 			    (uintmax_t)entry->offset);
 			if (entry->object.vm_object && entry->object.vm_object->cred)
 				db_printf(", obj ruid %d charge %jx",
 				    entry->object.vm_object->cred->cr_ruid,
 				    (uintmax_t)entry->object.vm_object->charge);
 			if (entry->eflags & MAP_ENTRY_COW)
 				db_printf(", copy (%s)",
 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
 			db_printf("\n");
 
 			if (prev == &map->header ||
 			    prev->object.vm_object !=
 				entry->object.vm_object) {
 				db_indent += 2;
 				vm_object_print((db_expr_t)(intptr_t)
 						entry->object.vm_object,
 						0, 0, (char *)0);
 				db_indent -= 2;
 			}
 		}
 		prev = entry;
 	}
 	db_indent -= 2;
 }
 
 DB_SHOW_COMMAND(map, map)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show map <addr>\n");
 		return;
 	}
 	vm_map_print((vm_map_t)addr);
 }
 
 DB_SHOW_COMMAND(procvm, procvm)
 {
 	struct proc *p;
 
 	if (have_addr) {
 		p = db_lookup_proc(addr);
 	} else {
 		p = curproc;
 	}
 
 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
 	    (void *)vmspace_pmap(p->p_vmspace));
 
 	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
 }
 
 #endif /* DDB */
Index: head/sys/vm/vm_object.c
===================================================================
--- head/sys/vm/vm_object.c	(revision 362612)
+++ head/sys/vm/vm_object.c	(revision 362613)
@@ -1,2844 +1,2847 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory object module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/blockcount.h>
 #include <sys/cpuset.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/pctrie.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
 #include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/refcount.h>
 #include <sys/rwlock.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 static int old_msync;
 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
     "Use old (insecure) msync behavior");
 
 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
 		    int pagerflags, int flags, boolean_t *allclean,
 		    boolean_t *eio);
 static boolean_t vm_object_page_remove_write(vm_page_t p, int flags,
 		    boolean_t *allclean);
 static void	vm_object_backing_remove(vm_object_t object);
 
 /*
  *	Virtual memory objects maintain the actual data
  *	associated with allocated virtual memory.  A given
  *	page of memory exists within exactly one object.
  *
  *	An object is only deallocated when all "references"
  *	are given up.  Only one "reference" to a given
  *	region of an object should be writeable.
  *
  *	Associated with each object is a list of all resident
  *	memory pages belonging to that object; this list is
  *	maintained by the "vm_page" module, and locked by the object's
  *	lock.
  *
  *	Each object also records a "pager" routine which is
  *	used to retrieve (and store) pages to the proper backing
  *	storage.  In addition, objects may be backed by other
  *	objects from which they were virtual-copied.
  *
  *	The only items within the object structure which are
  *	modified after time of creation are:
  *		reference count		locked by object's lock
  *		pager routine		locked by object's lock
  *
  */
 
 struct object_q vm_object_list;
 struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 struct vm_object kernel_object_store;
 
 static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "VM object stats");
 
 static COUNTER_U64_DEFINE_EARLY(object_collapses);
 SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
     &object_collapses,
     "VM object collapses");
 
 static COUNTER_U64_DEFINE_EARLY(object_bypasses);
 SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
     &object_bypasses,
     "VM object bypasses");
 
 static COUNTER_U64_DEFINE_EARLY(object_collapse_waits);
 SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapse_waits, CTLFLAG_RD,
     &object_collapse_waits,
     "Number of sleeps for collapse");
 
 static uma_zone_t obj_zone;
 
 static int vm_object_zinit(void *mem, int size, int flags);
 
 #ifdef INVARIANTS
 static void vm_object_zdtor(void *mem, int size, void *arg);
 
 static void
 vm_object_zdtor(void *mem, int size, void *arg)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	KASSERT(object->ref_count == 0,
 	    ("object %p ref_count = %d", object, object->ref_count));
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages in its memq", object));
 	KASSERT(vm_radix_is_empty(&object->rtree),
 	    ("object %p has resident pages in its trie", object));
 #if VM_NRESERVLEVEL > 0
 	KASSERT(LIST_EMPTY(&object->rvq),
 	    ("object %p has reservations",
 	    object));
 #endif
 	KASSERT(blockcount_read(&object->paging_in_progress) == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, blockcount_read(&object->paging_in_progress)));
 	KASSERT(!vm_object_busied(object),
 	    ("object %p busy = %d", object, blockcount_read(&object->busy)));
 	KASSERT(object->resident_page_count == 0,
 	    ("object %p resident_page_count = %d",
 	    object, object->resident_page_count));
 	KASSERT(object->shadow_count == 0,
 	    ("object %p shadow_count = %d",
 	    object, object->shadow_count));
 	KASSERT(object->type == OBJT_DEAD,
 	    ("object %p has non-dead type %d",
 	    object, object->type));
 }
 #endif
 
 static int
 vm_object_zinit(void *mem, int size, int flags)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW);
 
 	/* These are true for any object that has been freed */
 	object->type = OBJT_DEAD;
 	vm_radix_init(&object->rtree);
 	refcount_init(&object->ref_count, 0);
 	blockcount_init(&object->paging_in_progress);
 	blockcount_init(&object->busy);
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
 	object->flags = OBJ_DEAD;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 	return (0);
 }
 
 static void
 _vm_object_allocate(objtype_t type, vm_pindex_t size, u_short flags,
     vm_object_t object, void *handle)
 {
 
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
 
 	object->type = type;
 	if (type == OBJT_SWAP)
 		pctrie_init(&object->un_pager.swp.swp_blks);
 
 	/*
 	 * Ensure that swap_pager_swapoff() iteration over object_list
 	 * sees up to date type and pctrie head if it observed
 	 * non-dead object.
 	 */
 	atomic_thread_fence_rel();
 
 	object->pg_color = 0;
 	object->flags = flags;
 	object->size = size;
 	object->domain.dr_policy = NULL;
 	object->generation = 1;
 	object->cleangeneration = 1;
 	refcount_init(&object->ref_count, 1);
 	object->memattr = VM_MEMATTR_DEFAULT;
 	object->cred = NULL;
 	object->charge = 0;
 	object->handle = handle;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
 	umtx_shm_object_init(object);
 }
 
 /*
  *	vm_object_init:
  *
  *	Initialize the VM objects module.
  */
 void
 vm_object_init(void)
 {
 	TAILQ_INIT(&vm_object_list);
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	rw_init(&kernel_object->lock, "kernel vm object");
 	_vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
 	    VM_MIN_KERNEL_ADDRESS), OBJ_UNMANAGED, kernel_object, NULL);
 #if VM_NRESERVLEVEL > 0
 	kernel_object->flags |= OBJ_COLORED;
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	/*
 	 * The lock portion of struct vm_object must be type stable due
 	 * to vm_pageout_fallback_object_lock locking a vm object
 	 * without holding any references to it.
 	 */
 	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
 #ifdef INVARIANTS
 	    vm_object_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	vm_radix_zinit();
 }
 
 void
 vm_object_clear_flag(vm_object_t object, u_short bits)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->flags &= ~bits;
 }
 
 /*
  *	Sets the default memory attribute for the specified object.  Pages
  *	that are allocated to this object are by default assigned this memory
  *	attribute.
  *
  *	Presently, this function must be called before any pages are allocated
  *	to the object.  In the future, this requirement may be relaxed for
  *	"default" and "swap" objects.
  */
 int
 vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	switch (object->type) {
 	case OBJT_DEFAULT:
 	case OBJT_DEVICE:
 	case OBJT_MGTDEVICE:
 	case OBJT_PHYS:
 	case OBJT_SG:
 	case OBJT_SWAP:
 	case OBJT_VNODE:
 		if (!TAILQ_EMPTY(&object->memq))
 			return (KERN_FAILURE);
 		break;
 	case OBJT_DEAD:
 		return (KERN_INVALID_ARGUMENT);
 	default:
 		panic("vm_object_set_memattr: object %p is of undefined type",
 		    object);
 	}
 	object->memattr = memattr;
 	return (KERN_SUCCESS);
 }
 
 void
 vm_object_pip_add(vm_object_t object, short i)
 {
 
 	if (i > 0)
 		blockcount_acquire(&object->paging_in_progress, i);
 }
 
 void
 vm_object_pip_wakeup(vm_object_t object)
 {
 
 	vm_object_pip_wakeupn(object, 1);
 }
 
 void
 vm_object_pip_wakeupn(vm_object_t object, short i)
 {
 
 	if (i > 0)
 		blockcount_release(&object->paging_in_progress, i);
 }
 
 /*
  * Atomically drop the object lock and wait for pip to drain.  This protects
  * from sleep/wakeup races due to identity changes.  The lock is not re-acquired
  * on return.
  */
 static void
 vm_object_pip_sleep(vm_object_t object, const char *waitid)
 {
 
 	(void)blockcount_sleep(&object->paging_in_progress, &object->lock,
 	    waitid, PVM | PDROP);
 }
 
 void
 vm_object_pip_wait(vm_object_t object, const char *waitid)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	blockcount_wait(&object->paging_in_progress, &object->lock, waitid,
 	    PVM);
 }
 
 void
 vm_object_pip_wait_unlocked(vm_object_t object, const char *waitid)
 {
 
 	VM_OBJECT_ASSERT_UNLOCKED(object);
 
 	blockcount_wait(&object->paging_in_progress, NULL, waitid, PVM);
 }
 
 /*
  *	vm_object_allocate:
  *
  *	Returns a new object with the given size.
  */
 vm_object_t
 vm_object_allocate(objtype_t type, vm_pindex_t size)
 {
 	vm_object_t object;
 	u_short flags;
 
 	switch (type) {
 	case OBJT_DEAD:
 		panic("vm_object_allocate: can't create OBJT_DEAD");
 	case OBJT_DEFAULT:
 	case OBJT_SWAP:
 		flags = OBJ_COLORED;
 		break;
 	case OBJT_DEVICE:
 	case OBJT_SG:
 		flags = OBJ_FICTITIOUS | OBJ_UNMANAGED;
 		break;
 	case OBJT_MGTDEVICE:
 		flags = OBJ_FICTITIOUS;
 		break;
 	case OBJT_PHYS:
 		flags = OBJ_UNMANAGED;
 		break;
 	case OBJT_VNODE:
 		flags = 0;
 		break;
 	default:
 		panic("vm_object_allocate: type %d is undefined", type);
 	}
 	object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
 	_vm_object_allocate(type, size, flags, object, NULL);
 
 	return (object);
 }
 
 /*
  *	vm_object_allocate_anon:
  *
  *	Returns a new default object of the given size and marked as
  *	anonymous memory for special split/collapse handling.  Color
  *	to be initialized by the caller.
  */
 vm_object_t
 vm_object_allocate_anon(vm_pindex_t size, vm_object_t backing_object,
     struct ucred *cred, vm_size_t charge)
 {
 	vm_object_t handle, object;
 
 	if (backing_object == NULL)
 		handle = NULL;
 	else if ((backing_object->flags & OBJ_ANON) != 0)
 		handle = backing_object->handle;
 	else
 		handle = backing_object;
 	object = uma_zalloc(obj_zone, M_WAITOK);
 	_vm_object_allocate(OBJT_DEFAULT, size, OBJ_ANON | OBJ_ONEMAPPING,
 	    object, handle);
 	object->cred = cred;
 	object->charge = cred != NULL ? charge : 0;
 	return (object);
 }
 
 static void
 vm_object_reference_vnode(vm_object_t object)
 {
 	u_int old;
 
 	/*
 	 * vnode objects need the lock for the first reference
 	 * to serialize with vnode_object_deallocate().
 	 */
 	if (!refcount_acquire_if_gt(&object->ref_count, 0)) {
 		VM_OBJECT_RLOCK(object);
 		old = refcount_acquire(&object->ref_count);
 		if (object->type == OBJT_VNODE && old == 0)
 			vref(object->handle);
 		VM_OBJECT_RUNLOCK(object);
 	}
 }
 
 /*
  *	vm_object_reference:
  *
  *	Acquires a reference to the given object.
  */
 void
 vm_object_reference(vm_object_t object)
 {
 
 	if (object == NULL)
 		return;
 
 	if (object->type == OBJT_VNODE)
 		vm_object_reference_vnode(object);
 	else
 		refcount_acquire(&object->ref_count);
 	KASSERT((object->flags & OBJ_DEAD) == 0,
 	    ("vm_object_reference: Referenced dead object."));
 }
 
 /*
  *	vm_object_reference_locked:
  *
  *	Gets another reference to the given object.
  *
  *	The object must be locked.
  */
 void
 vm_object_reference_locked(vm_object_t object)
 {
 	u_int old;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	old = refcount_acquire(&object->ref_count);
 	if (object->type == OBJT_VNODE && old == 0)
 		vref(object->handle);
 	KASSERT((object->flags & OBJ_DEAD) == 0,
 	    ("vm_object_reference: Referenced dead object."));
 }
 
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
 static void
 vm_object_deallocate_vnode(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 	bool last;
 
 	KASSERT(object->type == OBJT_VNODE,
 	    ("vm_object_deallocate_vnode: not a vnode object"));
 	KASSERT(vp != NULL, ("vm_object_deallocate_vnode: missing vp"));
 
 	/* Object lock to protect handle lookup. */
 	last = refcount_release(&object->ref_count);
 	VM_OBJECT_RUNLOCK(object);
 
 	if (!last)
 		return;
 
 	if (!umtx_shm_vnobj_persistent)
 		umtx_shm_object_terminated(object);
 
 	/* vrele may need the vnode lock. */
 	vrele(vp);
 }
 
 
 /*
  * We dropped a reference on an object and discovered that it had a
  * single remaining shadow.  This is a sibling of the reference we
  * dropped.  Attempt to collapse the sibling and backing object.
  */
 static vm_object_t
 vm_object_deallocate_anon(vm_object_t backing_object)
 {
 	vm_object_t object;
 
 	/* Fetch the final shadow.  */
 	object = LIST_FIRST(&backing_object->shadow_head);
 	KASSERT(object != NULL && backing_object->shadow_count == 1,
 	    ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d",
 	    backing_object->ref_count, backing_object->shadow_count));
 	KASSERT((object->flags & (OBJ_TMPFS_NODE | OBJ_ANON)) == OBJ_ANON,
 	    ("invalid shadow object %p", object));
 
 	if (!VM_OBJECT_TRYWLOCK(object)) {
 		/*
 		 * Prevent object from disappearing since we do not have a
 		 * reference.
 		 */
 		vm_object_pip_add(object, 1);
 		VM_OBJECT_WUNLOCK(backing_object);
 		VM_OBJECT_WLOCK(object);
 		vm_object_pip_wakeup(object);
 	} else
 		VM_OBJECT_WUNLOCK(backing_object);
 
 	/*
 	 * Check for a collapse/terminate race with the last reference holder.
 	 */
 	if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 ||
 	    !refcount_acquire_if_not_zero(&object->ref_count)) {
 		VM_OBJECT_WUNLOCK(object);
 		return (NULL);
 	}
 	backing_object = object->backing_object;
 	if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0)
 		vm_object_collapse(object);
 	VM_OBJECT_WUNLOCK(object);
 
 	return (object);
 }
 
 /*
  *	vm_object_deallocate:
  *
  *	Release a reference to the specified object,
  *	gained either through a vm_object_allocate
  *	or a vm_object_reference call.  When all references
  *	are gone, storage associated with this object
  *	may be relinquished.
  *
  *	No object may be locked.
  */
 void
 vm_object_deallocate(vm_object_t object)
 {
 	vm_object_t temp;
 	bool released;
 
 	while (object != NULL) {
 		/*
 		 * If the reference count goes to 0 we start calling
 		 * vm_object_terminate() on the object chain.  A ref count
 		 * of 1 may be a special case depending on the shadow count
 		 * being 0 or 1.  These cases require a write lock on the
 		 * object.
 		 */
 		if ((object->flags & OBJ_ANON) == 0)
 			released = refcount_release_if_gt(&object->ref_count, 1);
 		else
 			released = refcount_release_if_gt(&object->ref_count, 2);
 		if (released)
 			return;
 
 		if (object->type == OBJT_VNODE) {
 			VM_OBJECT_RLOCK(object);
 			if (object->type == OBJT_VNODE) {
 				vm_object_deallocate_vnode(object);
 				return;
 			}
 			VM_OBJECT_RUNLOCK(object);
 		}
 
 		VM_OBJECT_WLOCK(object);
 		KASSERT(object->ref_count > 0,
 		    ("vm_object_deallocate: object deallocated too many times: %d",
 		    object->type));
 
 		/*
 		 * If this is not the final reference to an anonymous
 		 * object we may need to collapse the shadow chain.
 		 */
 		if (!refcount_release(&object->ref_count)) {
 			if (object->ref_count > 1 ||
 			    object->shadow_count == 0) {
 				if ((object->flags & OBJ_ANON) != 0 &&
 				    object->ref_count == 1)
 					vm_object_set_flag(object,
 					    OBJ_ONEMAPPING);
 				VM_OBJECT_WUNLOCK(object);
 				return;
 			}
 
 			/* Handle collapsing last ref on anonymous objects. */
 			object = vm_object_deallocate_anon(object);
 			continue;
 		}
 
 		/*
 		 * Handle the final reference to an object.  We restart
 		 * the loop with the backing object to avoid recursion.
 		 */
 		umtx_shm_object_terminated(object);
 		temp = object->backing_object;
 		if (temp != NULL) {
 			KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
 			    ("shadowed tmpfs v_object 2 %p", object));
 			vm_object_backing_remove(object);
 		}
 
 		KASSERT((object->flags & OBJ_DEAD) == 0,
 		    ("vm_object_deallocate: Terminating dead object."));
 		vm_object_set_flag(object, OBJ_DEAD);
 		vm_object_terminate(object);
 		object = temp;
 	}
 }
 
 /*
  *	vm_object_destroy removes the object from the global object list
  *      and frees the space for the object.
  */
 void
 vm_object_destroy(vm_object_t object)
 {
 
 	/*
 	 * Release the allocation charge.
 	 */
 	if (object->cred != NULL) {
 		swap_release_by_cred(object->charge, object->cred);
 		object->charge = 0;
 		crfree(object->cred);
 		object->cred = NULL;
 	}
 
 	/*
 	 * Free the space for the object.
 	 */
 	uma_zfree(obj_zone, object);
 }
 
 static void
 vm_object_backing_remove_locked(vm_object_t object)
 {
 	vm_object_t backing_object;
 
 	backing_object = object->backing_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
 	KASSERT((object->flags & OBJ_COLLAPSING) == 0,
 	    ("vm_object_backing_remove: Removing collapsing object."));
 
 	if ((object->flags & OBJ_SHADOWLIST) != 0) {
 		LIST_REMOVE(object, shadow_list);
 		backing_object->shadow_count--;
 		object->flags &= ~OBJ_SHADOWLIST;
 	}
 	object->backing_object = NULL;
 }
 
 static void
 vm_object_backing_remove(vm_object_t object)
 {
 	vm_object_t backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	if ((object->flags & OBJ_SHADOWLIST) != 0) {
 		backing_object = object->backing_object;
 		VM_OBJECT_WLOCK(backing_object);
 		vm_object_backing_remove_locked(object);
 		VM_OBJECT_WUNLOCK(backing_object);
 	} else
 		object->backing_object = NULL;
 }
 
 static void
 vm_object_backing_insert_locked(vm_object_t object, vm_object_t backing_object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	if ((backing_object->flags & OBJ_ANON) != 0) {
 		VM_OBJECT_ASSERT_WLOCKED(backing_object);
 		LIST_INSERT_HEAD(&backing_object->shadow_head, object,
 		    shadow_list);
 		backing_object->shadow_count++;
 		object->flags |= OBJ_SHADOWLIST;
 	}
 	object->backing_object = backing_object;
 }
 
 static void
 vm_object_backing_insert(vm_object_t object, vm_object_t backing_object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	if ((backing_object->flags & OBJ_ANON) != 0) {
 		VM_OBJECT_WLOCK(backing_object);
 		vm_object_backing_insert_locked(object, backing_object);
 		VM_OBJECT_WUNLOCK(backing_object);
 	} else
 		object->backing_object = backing_object;
 }
 
 /*
  * Insert an object into a backing_object's shadow list with an additional
  * reference to the backing_object added.
  */
 static void
 vm_object_backing_insert_ref(vm_object_t object, vm_object_t backing_object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	if ((backing_object->flags & OBJ_ANON) != 0) {
 		VM_OBJECT_WLOCK(backing_object);
 		KASSERT((backing_object->flags & OBJ_DEAD) == 0,
 		    ("shadowing dead anonymous object"));
 		vm_object_reference_locked(backing_object);
 		vm_object_backing_insert_locked(object, backing_object);
 		vm_object_clear_flag(backing_object, OBJ_ONEMAPPING);
 		VM_OBJECT_WUNLOCK(backing_object);
 	} else {
 		vm_object_reference(backing_object);
 		object->backing_object = backing_object;
 	}
 }
 
 /*
  * Transfer a backing reference from backing_object to object.
  */
 static void
 vm_object_backing_transfer(vm_object_t object, vm_object_t backing_object)
 {
 	vm_object_t new_backing_object;
 
 	/*
 	 * Note that the reference to backing_object->backing_object
 	 * moves from within backing_object to within object.
 	 */
 	vm_object_backing_remove_locked(object);
 	new_backing_object = backing_object->backing_object;
 	if (new_backing_object == NULL)
 		return;
 	if ((new_backing_object->flags & OBJ_ANON) != 0) {
 		VM_OBJECT_WLOCK(new_backing_object);
 		vm_object_backing_remove_locked(backing_object);
 		vm_object_backing_insert_locked(object, new_backing_object);
 		VM_OBJECT_WUNLOCK(new_backing_object);
 	} else {
 		object->backing_object = new_backing_object;
 		backing_object->backing_object = NULL;
 	}
 }
 
 /*
  * Wait for a concurrent collapse to settle.
  */
 static void
 vm_object_collapse_wait(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	while ((object->flags & OBJ_COLLAPSING) != 0) {
 		vm_object_pip_wait(object, "vmcolwait");
 		counter_u64_add(object_collapse_waits, 1);
 	}
 }
 
 /*
  * Waits for a backing object to clear a pending collapse and returns
  * it locked if it is an ANON object.
  */
 static vm_object_t
 vm_object_backing_collapse_wait(vm_object_t object)
 {
 	vm_object_t backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	for (;;) {
 		backing_object = object->backing_object;
 		if (backing_object == NULL ||
 		    (backing_object->flags & OBJ_ANON) == 0)
 			return (NULL);
 		VM_OBJECT_WLOCK(backing_object);
 		if ((backing_object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) == 0)
 			break;
 		VM_OBJECT_WUNLOCK(object);
 		vm_object_pip_sleep(backing_object, "vmbckwait");
 		counter_u64_add(object_collapse_waits, 1);
 		VM_OBJECT_WLOCK(object);
 	}
 	return (backing_object);
 }
 
 /*
  *	vm_object_terminate_pages removes any remaining pageable pages
  *	from the object and resets the object to an empty state.
  */
 static void
 vm_object_terminate_pages(vm_object_t object)
 {
 	vm_page_t p, p_next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Free any remaining pageable pages.  This also removes them from the
 	 * paging queues.  However, don't free wired pages, just remove them
 	 * from the object.  Rather than incrementally removing each page from
 	 * the object, the page and object are reset to any empty state. 
 	 */
 	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
 		vm_page_assert_unbusied(p);
 		KASSERT(p->object == object &&
 		    (p->ref_count & VPRC_OBJREF) != 0,
 		    ("vm_object_terminate_pages: page %p is inconsistent", p));
 
 		p->object = NULL;
 		if (vm_page_drop(p, VPRC_OBJREF) == VPRC_OBJREF) {
 			VM_CNT_INC(v_pfree);
 			vm_page_free(p);
 		}
 	}
 
 	/*
 	 * If the object contained any pages, then reset it to an empty state.
 	 * None of the object's fields, including "resident_page_count", were
 	 * modified by the preceding loop.
 	 */
 	if (object->resident_page_count != 0) {
 		vm_radix_reclaim_allnodes(&object->rtree);
 		TAILQ_INIT(&object->memq);
 		object->resident_page_count = 0;
 		if (object->type == OBJT_VNODE)
 			vdrop(object->handle);
 	}
 }
 
 /*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
  *	The object must be locked.
  *	This routine may block.
  */
 void
 vm_object_terminate(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_DEAD) != 0,
 	    ("terminating non-dead obj %p", object));
 	KASSERT((object->flags & OBJ_COLLAPSING) == 0,
 	    ("terminating collapsing obj %p", object));
 	KASSERT(object->backing_object == NULL,
 	    ("terminating shadow obj %p", object));
 
 	/*
 	 * wait for the pageout daemon to be done with the object
 	 */
 	vm_object_pip_wait(object, "objtrm");
 
 	KASSERT(!blockcount_read(&object->paging_in_progress),
 	    ("vm_object_terminate: pageout in progress"));
 
 	KASSERT(object->ref_count == 0,
 	    ("vm_object_terminate: object with references, ref_count=%d",
 	    object->ref_count));
 
 	if ((object->flags & OBJ_PG_DTOR) == 0)
 		vm_object_terminate_pages(object);
 
 #if VM_NRESERVLEVEL > 0
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
 
 	KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
 	    object->type == OBJT_SWAP,
 	    ("%s: non-swap obj %p has cred", __func__, object));
 
 	/*
 	 * Let the pager know object is dead.
 	 */
 	vm_pager_deallocate(object);
 	VM_OBJECT_WUNLOCK(object);
 
 	vm_object_destroy(object);
 }
 
 /*
  * Make the page read-only so that we can clear the object flags.  However, if
  * this is a nosync mmap then the object is likely to stay dirty so do not
  * mess with the page and do not clear the object flags.  Returns TRUE if the
  * page should be flushed, and FALSE otherwise.
  */
 static boolean_t
 vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean)
 {
 
 	vm_page_assert_busied(p);
 
 	/*
 	 * If we have been asked to skip nosync pages and this is a
 	 * nosync page, skip it.  Note that the object flags were not
 	 * cleared in this case so we do not have to set them.
 	 */
 	if ((flags & OBJPC_NOSYNC) != 0 && (p->a.flags & PGA_NOSYNC) != 0) {
 		*allclean = FALSE;
 		return (FALSE);
 	} else {
 		pmap_remove_write(p);
 		return (p->dirty != 0);
 	}
 }
 
 /*
  *	vm_object_page_clean
  *
  *	Clean all dirty pages in the specified range of object.  Leaves page 
  * 	on whatever queue it is currently on.   If NOSYNC is set then do not
  *	write out pages with PGA_NOSYNC set (originally comes from MAP_NOSYNC),
  *	leaving the object dirty.
  *
  *	For swap objects backing tmpfs regular files, do not flush anything,
  *	but remove write protection on the mapped pages to update mtime through
  *	mmaped writes.
  *
  *	When stuffing pages asynchronously, allow clustering.  XXX we need a
  *	synchronous clustering mode implementation.
  *
  *	Odd semantics: if start == end, we clean everything.
  *
  *	The object must be locked.
  *
  *	Returns FALSE if some page from the range was not written, as
  *	reported by the pager, and TRUE otherwise.
  */
 boolean_t
 vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end,
     int flags)
 {
 	vm_page_t np, p;
 	vm_pindex_t pi, tend, tstart;
 	int curgeneration, n, pagerflags;
 	boolean_t eio, res, allclean;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	if (!vm_object_mightbedirty(object) || object->resident_page_count == 0)
 		return (TRUE);
 
 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ?
 	    VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
 	pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0;
 
 	tstart = OFF_TO_IDX(start);
 	tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK);
 	allclean = tstart == 0 && tend >= object->size;
 	res = TRUE;
 
 rescan:
 	curgeneration = object->generation;
 
 	for (p = vm_page_find_least(object, tstart); p != NULL; p = np) {
 		pi = p->pindex;
 		if (pi >= tend)
 			break;
 		np = TAILQ_NEXT(p, listq);
 		if (vm_page_none_valid(p))
 			continue;
 		if (vm_page_busy_acquire(p, VM_ALLOC_WAITFAIL) == 0) {
 			if (object->generation != curgeneration &&
 			    (flags & OBJPC_SYNC) != 0)
 				goto rescan;
 			np = vm_page_find_least(object, pi);
 			continue;
 		}
 		if (!vm_object_page_remove_write(p, flags, &allclean)) {
 			vm_page_xunbusy(p);
 			continue;
 		}
 		if (object->type == OBJT_VNODE) {
 			n = vm_object_page_collect_flush(object, p, pagerflags,
 			    flags, &allclean, &eio);
 			if (eio) {
 				res = FALSE;
 				allclean = FALSE;
 			}
 			if (object->generation != curgeneration &&
 			    (flags & OBJPC_SYNC) != 0)
 				goto rescan;
 
 			/*
 			 * If the VOP_PUTPAGES() did a truncated write, so
 			 * that even the first page of the run is not fully
 			 * written, vm_pageout_flush() returns 0 as the run
 			 * length.  Since the condition that caused truncated
 			 * write may be permanent, e.g. exhausted free space,
 			 * accepting n == 0 would cause an infinite loop.
 			 *
 			 * Forwarding the iterator leaves the unwritten page
 			 * behind, but there is not much we can do there if
 			 * filesystem refuses to write it.
 			 */
 			if (n == 0) {
 				n = 1;
 				allclean = FALSE;
 			}
 		} else {
 			n = 1;
 			vm_page_xunbusy(p);
 		}
 		np = vm_page_find_least(object, pi + n);
 	}
 #if 0
 	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0);
 #endif
 
 	/*
 	 * Leave updating cleangeneration for tmpfs objects to tmpfs
 	 * scan.  It needs to update mtime, which happens for other
 	 * filesystems during page writeouts.
 	 */
 	if (allclean && object->type == OBJT_VNODE)
 		object->cleangeneration = curgeneration;
 	return (res);
 }
 
 static int
 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags,
     int flags, boolean_t *allclean, boolean_t *eio)
 {
 	vm_page_t ma[vm_pageout_page_count], p_first, tp;
 	int count, i, mreq, runlen;
 
 	vm_page_lock_assert(p, MA_NOTOWNED);
 	vm_page_assert_xbusied(p);
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	count = 1;
 	mreq = 0;
 
 	for (tp = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_next(tp);
 		if (tp == NULL || vm_page_tryxbusy(tp) == 0)
 			break;
 		if (!vm_object_page_remove_write(tp, flags, allclean)) {
 			vm_page_xunbusy(tp);
 			break;
 		}
 	}
 
 	for (p_first = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_prev(p_first);
 		if (tp == NULL || vm_page_tryxbusy(tp) == 0)
 			break;
 		if (!vm_object_page_remove_write(tp, flags, allclean)) {
 			vm_page_xunbusy(tp);
 			break;
 		}
 		p_first = tp;
 		mreq++;
 	}
 
 	for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++)
 		ma[i] = tp;
 
 	vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio);
 	return (runlen);
 }
 
 /*
  * Note that there is absolutely no sense in writing out
  * anonymous objects, so we track down the vnode object
  * to write out.
  * We invalidate (remove) all pages from the address space
  * for semantic correctness.
  *
  * If the backing object is a device object with unmanaged pages, then any
  * mappings to the specified range of pages must be removed before this
  * function is called.
  *
  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
  * may start out with a NULL object.
  */
 boolean_t
 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
     boolean_t syncio, boolean_t invalidate)
 {
 	vm_object_t backing_object;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, flags, fsync_after;
 	boolean_t res;
 
 	if (object == NULL)
 		return (TRUE);
 	res = TRUE;
 	error = 0;
 	VM_OBJECT_WLOCK(object);
 	while ((backing_object = object->backing_object) != NULL) {
 		VM_OBJECT_WLOCK(backing_object);
 		offset += object->backing_object_offset;
 		VM_OBJECT_WUNLOCK(object);
 		object = backing_object;
 		if (object->size < OFF_TO_IDX(offset + size))
 			size = IDX_TO_OFF(object->size) - offset;
 	}
 	/*
 	 * Flush pages if writing is allowed, invalidate them
 	 * if invalidation requested.  Pages undergoing I/O
 	 * will be ignored by vm_object_page_remove().
 	 *
 	 * We cannot lock the vnode and then wait for paging
 	 * to complete without deadlocking against vm_fault.
 	 * Instead we simply call vm_object_page_remove() and
 	 * allow it to block internally on a page-by-page
 	 * basis when it encounters pages undergoing async
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
 	    vm_object_mightbedirty(object) != 0 &&
 	    ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) {
 		VM_OBJECT_WUNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (syncio && !invalidate && offset == 0 &&
 		    atop(size) == object->size) {
 			/*
 			 * If syncing the whole mapping of the file,
 			 * it is faster to schedule all the writes in
 			 * async mode, also allowing the clustering,
 			 * and then wait for i/o to complete.
 			 */
 			flags = 0;
 			fsync_after = TRUE;
 		} else {
 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
 			flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0;
 			fsync_after = FALSE;
 		}
 		VM_OBJECT_WLOCK(object);
 		res = vm_object_page_clean(object, offset, offset + size,
 		    flags);
 		VM_OBJECT_WUNLOCK(object);
 		if (fsync_after)
 			error = VOP_FSYNC(vp, MNT_WAIT, curthread);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 		if (error != 0)
 			res = FALSE;
 		VM_OBJECT_WLOCK(object);
 	}
 	if ((object->type == OBJT_VNODE ||
 	     object->type == OBJT_DEVICE) && invalidate) {
 		if (object->type == OBJT_DEVICE)
 			/*
 			 * The option OBJPR_NOTMAPPED must be passed here
 			 * because vm_object_page_remove() cannot remove
 			 * unmanaged mappings.
 			 */
 			flags = OBJPR_NOTMAPPED;
 		else if (old_msync)
 			flags = 0;
 		else
 			flags = OBJPR_CLEANONLY;
 		vm_object_page_remove(object, OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK), flags);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (res);
 }
 
 /*
  * Determine whether the given advice can be applied to the object.  Advice is
  * not applied to unmanaged pages since they never belong to page queues, and
  * since MADV_FREE is destructive, it can apply only to anonymous pages that
  * have been mapped at most once.
  */
 static bool
 vm_object_advice_applies(vm_object_t object, int advice)
 {
 
 	if ((object->flags & OBJ_UNMANAGED) != 0)
 		return (false);
 	if (advice != MADV_FREE)
 		return (true);
 	return ((object->flags & (OBJ_ONEMAPPING | OBJ_ANON)) ==
 	    (OBJ_ONEMAPPING | OBJ_ANON));
 }
 
 static void
 vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex,
     vm_size_t size)
 {
 
 	if (advice == MADV_FREE && object->type == OBJT_SWAP)
 		swap_pager_freespace(object, pindex, size);
 }
 
 /*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
  *
  *	MADV_WILLNEED	(any object)
  *
  *	    Activate the specified pages if they are resident.
  *
  *	MADV_DONTNEED	(any object)
  *
  *	    Deactivate the specified pages if they are resident.
  *
  *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects,
  *			 OBJ_ONEMAPPING only)
  *
  *	    Deactivate and clean the specified pages if they are
  *	    resident.  This permits the process to reuse the pages
  *	    without faulting or the kernel to reclaim the pages
  *	    without I/O.
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
     int advice)
 {
 	vm_pindex_t tpindex;
 	vm_object_t backing_object, tobject;
 	vm_page_t m, tm;
 
 	if (object == NULL)
 		return;
 
 relookup:
 	VM_OBJECT_WLOCK(object);
 	if (!vm_object_advice_applies(object, advice)) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) {
 		tobject = object;
 
 		/*
 		 * If the next page isn't resident in the top-level object, we
 		 * need to search the shadow chain.  When applying MADV_FREE, we
 		 * take care to release any swap space used to store
 		 * non-resident pages.
 		 */
 		if (m == NULL || pindex < m->pindex) {
 			/*
 			 * Optimize a common case: if the top-level object has
 			 * no backing object, we can skip over the non-resident
 			 * range in constant time.
 			 */
 			if (object->backing_object == NULL) {
 				tpindex = (m != NULL && m->pindex < end) ?
 				    m->pindex : end;
 				vm_object_madvise_freespace(object, advice,
 				    pindex, tpindex - pindex);
 				if ((pindex = tpindex) == end)
 					break;
 				goto next_page;
 			}
 
 			tpindex = pindex;
 			do {
 				vm_object_madvise_freespace(tobject, advice,
 				    tpindex, 1);
 				/*
 				 * Prepare to search the next object in the
 				 * chain.
 				 */
 				backing_object = tobject->backing_object;
 				if (backing_object == NULL)
 					goto next_pindex;
 				VM_OBJECT_WLOCK(backing_object);
 				tpindex +=
 				    OFF_TO_IDX(tobject->backing_object_offset);
 				if (tobject != object)
 					VM_OBJECT_WUNLOCK(tobject);
 				tobject = backing_object;
 				if (!vm_object_advice_applies(tobject, advice))
 					goto next_pindex;
 			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
 			    NULL);
 		} else {
 next_page:
 			tm = m;
 			m = TAILQ_NEXT(m, listq);
 		}
 
 		/*
 		 * If the page is not in a normal state, skip it.  The page
 		 * can not be invalidated while the object lock is held.
 		 */
 		if (!vm_page_all_valid(tm) || vm_page_wired(tm))
 			goto next_pindex;
 		KASSERT((tm->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_madvise: page %p is fictitious", tm));
 		KASSERT((tm->oflags & VPO_UNMANAGED) == 0,
 		    ("vm_object_madvise: page %p is not managed", tm));
 		if (vm_page_tryxbusy(tm) == 0) {
 			if (object != tobject)
 				VM_OBJECT_WUNLOCK(object);
 			if (advice == MADV_WILLNEED) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_aflag_set(tm, PGA_REFERENCED);
 			}
 			vm_page_busy_sleep(tm, "madvpo", false);
   			goto relookup;
 		}
 		vm_page_advise(tm, advice);
 		vm_page_xunbusy(tm);
 		vm_object_madvise_freespace(tobject, advice, tm->pindex, 1);
 next_pindex:
 		if (tobject != object)
 			VM_OBJECT_WUNLOCK(tobject);
 	}
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  *	vm_object_shadow:
  *
  *	Create a new object which is backed by the
  *	specified existing object range.  The source
  *	object reference is deallocated.
  *
  *	The new object and offset into that object
  *	are returned in the source parameters.
  */
 void
 vm_object_shadow(vm_object_t *object, vm_ooffset_t *offset, vm_size_t length,
     struct ucred *cred, bool shared)
 {
 	vm_object_t source;
 	vm_object_t result;
 
 	source = *object;
 
 	/*
 	 * Don't create the new object if the old object isn't shared.
 	 *
 	 * If we hold the only reference we can guarantee that it won't
 	 * increase while we have the map locked.  Otherwise the race is
 	 * harmless and we will end up with an extra shadow object that
 	 * will be collapsed later.
 	 */
 	if (source != NULL && source->ref_count == 1 &&
 	    (source->flags & OBJ_ANON) != 0)
 		return;
 
 	/*
 	 * Allocate a new object with the given length.
 	 */
 	result = vm_object_allocate_anon(atop(length), source, cred, length);
 
 	/*
 	 * Store the offset into the source object, and fix up the offset into
 	 * the new object.
 	 */
 	result->backing_object_offset = *offset;
 
 	if (shared || source != NULL) {
 		VM_OBJECT_WLOCK(result);
 
 		/*
 		 * The new object shadows the source object, adding a
 		 * reference to it.  Our caller changes his reference
 		 * to point to the new object, removing a reference to
 		 * the source object.  Net result: no change of
 		 * reference count, unless the caller needs to add one
 		 * more reference due to forking a shared map entry.
 		 */
 		if (shared) {
 			vm_object_reference_locked(result);
 			vm_object_clear_flag(result, OBJ_ONEMAPPING);
 		}
 
 		/*
 		 * Try to optimize the result object's page color when
 		 * shadowing in order to maintain page coloring
 		 * consistency in the combined shadowed object.
 		 */
 		if (source != NULL) {
 			vm_object_backing_insert(result, source);
 			result->domain = source->domain;
 #if VM_NRESERVLEVEL > 0
 			result->flags |= source->flags & OBJ_COLORED;
 			result->pg_color = (source->pg_color +
 			    OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER -
 			    1)) - 1);
 #endif
 		}
 		VM_OBJECT_WUNLOCK(result);
 	}
 
 	/*
 	 * Return the new things
 	 */
 	*offset = 0;
 	*object = result;
 }
 
 /*
  *	vm_object_split:
  *
  * Split the pages in a map entry into a new object.  This affords
  * easier removal of unused pages, and keeps object inheritance from
  * being a negative impact on memory usage.
  */
 void
 vm_object_split(vm_map_entry_t entry)
 {
 	vm_page_t m, m_next;
 	vm_object_t orig_object, new_object, backing_object;
 	vm_pindex_t idx, offidxstart;
 	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
 	KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0,
 	    ("vm_object_split:  Splitting object with multiple mappings."));
 	if ((orig_object->flags & OBJ_ANON) == 0)
 		return;
 	if (orig_object->ref_count <= 1)
 		return;
 	VM_OBJECT_WUNLOCK(orig_object);
 
 	offidxstart = OFF_TO_IDX(entry->offset);
 	size = atop(entry->end - entry->start);
 
 	/*
 	 * If swap_pager_copy() is later called, it will convert new_object
 	 * into a swap object.
 	 */
 	new_object = vm_object_allocate_anon(size, orig_object,
 	    orig_object->cred, ptoa(size));
 
 	/*
 	 * We must wait for the orig_object to complete any in-progress
 	 * collapse so that the swap blocks are stable below.  The
 	 * additional reference on backing_object by new object will
 	 * prevent further collapse operations until split completes.
 	 */
 	VM_OBJECT_WLOCK(orig_object);
 	vm_object_collapse_wait(orig_object);
 
 	/*
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
 	VM_OBJECT_WLOCK(new_object);
 	new_object->domain = orig_object->domain;
 	backing_object = orig_object->backing_object;
 	if (backing_object != NULL) {
 		vm_object_backing_insert_ref(new_object, backing_object);
 		new_object->backing_object_offset = 
 		    orig_object->backing_object_offset + entry->offset;
 	}
 	if (orig_object->cred != NULL) {
 		crhold(orig_object->cred);
 		KASSERT(orig_object->charge >= ptoa(size),
 		    ("orig_object->charge < 0"));
 		orig_object->charge -= ptoa(size);
 	}
 
 	/*
 	 * Mark the split operation so that swap_pager_getpages() knows
 	 * that the object is in transition.
 	 */
 	vm_object_set_flag(orig_object, OBJ_SPLIT);
 retry:
 	m = vm_page_find_least(orig_object, offidxstart);
 	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
 	    m = m_next) {
 		m_next = TAILQ_NEXT(m, listq);
 
 		/*
 		 * We must wait for pending I/O to complete before we can
 		 * rename the page.
 		 *
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
 		if (vm_page_tryxbusy(m) == 0) {
 			VM_OBJECT_WUNLOCK(new_object);
 			vm_page_sleep_if_busy(m, "spltwt");
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
 		}
 
 		/*
 		 * The page was left invalid.  Likely placed there by
 		 * an incomplete fault.  Just remove and ignore.
 		 */
 		if (vm_page_none_valid(m)) {
 			if (vm_page_remove(m))
 				vm_page_free(m);
 			continue;
 		}
 
 		/* vm_page_rename() will dirty the page. */
 		if (vm_page_rename(m, new_object, idx)) {
 			vm_page_xunbusy(m);
 			VM_OBJECT_WUNLOCK(new_object);
 			VM_OBJECT_WUNLOCK(orig_object);
 			vm_radix_wait();
 			VM_OBJECT_WLOCK(orig_object);
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
 		}
 
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * If some of the reservation's allocated pages remain with
 		 * the original object, then transferring the reservation to
 		 * the new object is neither particularly beneficial nor
 		 * particularly harmful as compared to leaving the reservation
 		 * with the original object.  If, however, all of the
 		 * reservation's allocated pages are transferred to the new
 		 * object, then transferring the reservation is typically
 		 * beneficial.  Determining which of these two cases applies
 		 * would be more costly than unconditionally renaming the
 		 * reservation.
 		 */
 		vm_reserv_rename(m, new_object, orig_object, offidxstart);
 #endif
 		if (orig_object->type != OBJT_SWAP)
 			vm_page_xunbusy(m);
 	}
 	if (orig_object->type == OBJT_SWAP) {
 		/*
 		 * swap_pager_copy() can sleep, in which case the orig_object's
 		 * and new_object's locks are released and reacquired. 
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 		TAILQ_FOREACH(m, &new_object->memq, listq)
 			vm_page_xunbusy(m);
 	}
 	vm_object_clear_flag(orig_object, OBJ_SPLIT);
 	VM_OBJECT_WUNLOCK(orig_object);
 	VM_OBJECT_WUNLOCK(new_object);
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
 	vm_object_deallocate(orig_object);
 	VM_OBJECT_WLOCK(new_object);
 }
 
 static vm_page_t
 vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p)
 {
 	vm_object_t backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	backing_object = object->backing_object;
 	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
 	KASSERT(p == NULL || p->object == object || p->object == backing_object,
 	    ("invalid ownership %p %p %p", p, object, backing_object));
 	/* The page is only NULL when rename fails. */
 	if (p == NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		VM_OBJECT_WUNLOCK(backing_object);
 		vm_radix_wait();
 	} else {
 		if (p->object == object)
 			VM_OBJECT_WUNLOCK(backing_object);
 		else
 			VM_OBJECT_WUNLOCK(object);
 		vm_page_busy_sleep(p, "vmocol", false);
 	}
 	VM_OBJECT_WLOCK(object);
 	VM_OBJECT_WLOCK(backing_object);
 	return (TAILQ_FIRST(&backing_object->memq));
 }
 
 static bool
 vm_object_scan_all_shadowed(vm_object_t object)
 {
 	vm_object_t backing_object;
 	vm_page_t p, pp;
 	vm_pindex_t backing_offset_index, new_pindex, pi, ps;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
 
 	if ((backing_object->flags & OBJ_ANON) == 0)
 		return (false);
 
 	pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 	p = vm_page_find_least(backing_object, pi);
 	ps = swap_pager_find_least(backing_object, pi);
 
 	/*
 	 * Only check pages inside the parent object's range and
 	 * inside the parent object's mapping of the backing object.
 	 */
 	for (;; pi++) {
 		if (p != NULL && p->pindex < pi)
 			p = TAILQ_NEXT(p, listq);
 		if (ps < pi)
 			ps = swap_pager_find_least(backing_object, pi);
 		if (p == NULL && ps >= backing_object->size)
 			break;
 		else if (p == NULL)
 			pi = ps;
 		else
 			pi = MIN(p->pindex, ps);
 
 		new_pindex = pi - backing_offset_index;
 		if (new_pindex >= object->size)
 			break;
 
 		if (p != NULL) {
 			/*
 			 * If the backing object page is busy a
 			 * grandparent or older page may still be
 			 * undergoing CoW.  It is not safe to collapse
 			 * the backing object until it is quiesced.
 			 */
 			if (vm_page_tryxbusy(p) == 0)
 				return (false);
 
 			/*
 			 * We raced with the fault handler that left
 			 * newly allocated invalid page on the object
 			 * queue and retried.
 			 */
 			if (!vm_page_all_valid(p))
 				goto unbusy_ret;
 		}
 
 		/*
 		 * See if the parent has the page or if the parent's object
 		 * pager has the page.  If the parent has the page but the page
 		 * is not valid, the parent's object pager must have the page.
 		 *
 		 * If this fails, the parent does not completely shadow the
 		 * object and we might as well give up now.
 		 */
 		pp = vm_page_lookup(object, new_pindex);
 
 		/*
 		 * The valid check here is stable due to object lock
 		 * being required to clear valid and initiate paging.
 		 * Busy of p disallows fault handler to validate pp.
 		 */
 		if ((pp == NULL || vm_page_none_valid(pp)) &&
 		    !vm_pager_has_page(object, new_pindex, NULL, NULL))
 			goto unbusy_ret;
 		if (p != NULL)
 			vm_page_xunbusy(p);
 	}
 	return (true);
 
 unbusy_ret:
 	if (p != NULL)
 		vm_page_xunbusy(p);
 	return (false);
 }
 
 static void
 vm_object_collapse_scan(vm_object_t object)
 {
 	vm_object_t backing_object;
 	vm_page_t next, p, pp;
 	vm_pindex_t backing_offset_index, new_pindex;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
 	 * Our scan
 	 */
 	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
 		next = TAILQ_NEXT(p, listq);
 		new_pindex = p->pindex - backing_offset_index;
 
 		/*
 		 * Check for busy page
 		 */
 		if (vm_page_tryxbusy(p) == 0) {
 			next = vm_object_collapse_scan_wait(object, p);
 			continue;
 		}
 
 		KASSERT(object->backing_object == backing_object,
 		    ("vm_object_collapse_scan: backing object mismatch %p != %p",
 		    object->backing_object, backing_object));
 		KASSERT(p->object == backing_object,
 		    ("vm_object_collapse_scan: object mismatch %p != %p",
 		    p->object, backing_object));
 
 		if (p->pindex < backing_offset_index ||
 		    new_pindex >= object->size) {
 			if (backing_object->type == OBJT_SWAP)
 				swap_pager_freespace(backing_object, p->pindex,
 				    1);
 
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
 			if (vm_page_remove(p))
 				vm_page_free(p);
 			continue;
 		}
 
 		if (!vm_page_all_valid(p)) {
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
 			if (vm_page_remove(p))
 				vm_page_free(p);
 			continue;
 		}
 
 		pp = vm_page_lookup(object, new_pindex);
 		if (pp != NULL && vm_page_tryxbusy(pp) == 0) {
 			vm_page_xunbusy(p);
 			/*
 			 * The page in the parent is busy and possibly not
 			 * (yet) valid.  Until its state is finalized by the
 			 * busy bit owner, we can't tell whether it shadows the
 			 * original page.
 			 */
 			next = vm_object_collapse_scan_wait(object, pp);
 			continue;
 		}
 
 		if (pp != NULL && vm_page_none_valid(pp)) {
 			/*
 			 * The page was invalid in the parent.  Likely placed
 			 * there by an incomplete fault.  Just remove and
 			 * ignore.  p can replace it.
 			 */
 			if (vm_page_remove(pp))
 				vm_page_free(pp);
 			pp = NULL;
 		}
 
 		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
 			NULL)) {
 			/*
 			 * The page already exists in the parent OR swap exists
 			 * for this location in the parent.  Leave the parent's
 			 * page alone.  Destroy the original page from the
 			 * backing object.
 			 */
 			if (backing_object->type == OBJT_SWAP)
 				swap_pager_freespace(backing_object, p->pindex,
 				    1);
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
 			if (vm_page_remove(p))
 				vm_page_free(p);
 			if (pp != NULL)
 				vm_page_xunbusy(pp);
 			continue;
 		}
 
 		/*
 		 * Page does not exist in parent, rename the page from the
 		 * backing object to the main object.
 		 *
 		 * If the page was mapped to a process, it can remain mapped
 		 * through the rename.  vm_page_rename() will dirty the page.
 		 */
 		if (vm_page_rename(p, object, new_pindex)) {
 			vm_page_xunbusy(p);
 			next = vm_object_collapse_scan_wait(object, NULL);
 			continue;
 		}
 
 		/* Use the old pindex to free the right page. */
 		if (backing_object->type == OBJT_SWAP)
 			swap_pager_freespace(backing_object,
 			    new_pindex + backing_offset_index, 1);
 
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * Rename the reservation.
 		 */
 		vm_reserv_rename(p, object, backing_object,
 		    backing_offset_index);
 #endif
 		vm_page_xunbusy(p);
 	}
 	return;
 }
 
 /*
  *	vm_object_collapse:
  *
  *	Collapse an object with the object backing it.
  *	Pages in the backing object are moved into the
  *	parent, and the backing object is deallocated.
  */
 void
 vm_object_collapse(vm_object_t object)
 {
 	vm_object_t backing_object, new_backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	while (TRUE) {
 		KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON,
 		    ("collapsing invalid object"));
 
 		/*
 		 * Wait for the backing_object to finish any pending
 		 * collapse so that the caller sees the shortest possible
 		 * shadow chain.
 		 */
 		backing_object = vm_object_backing_collapse_wait(object);
 		if (backing_object == NULL)
 			return;
 
 		KASSERT(object->ref_count > 0 &&
 		    object->ref_count > object->shadow_count,
 		    ("collapse with invalid ref %d or shadow %d count.",
 		    object->ref_count, object->shadow_count));
 		KASSERT((backing_object->flags &
 		    (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
 		    ("vm_object_collapse: Backing object already collapsing."));
 		KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
 		    ("vm_object_collapse: object is already collapsing."));
 
 		/*
 		 * We know that we can either collapse the backing object if
 		 * the parent is the only reference to it, or (perhaps) have
 		 * the parent bypass the object if the parent happens to shadow
 		 * all the resident pages in the entire backing object.
 		 */
 		if (backing_object->ref_count == 1) {
 			KASSERT(backing_object->shadow_count == 1,
 			    ("vm_object_collapse: shadow_count: %d",
 			    backing_object->shadow_count));
 			vm_object_pip_add(object, 1);
 			vm_object_set_flag(object, OBJ_COLLAPSING);
 			vm_object_pip_add(backing_object, 1);
 			vm_object_set_flag(backing_object, OBJ_DEAD);
 
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.
 			 */
 			vm_object_collapse_scan(object);
 
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Break any reservations from backing_object.
 			 */
 			if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
 				vm_reserv_break_all(backing_object);
 #endif
 
 			/*
 			 * Move the pager from backing_object to object.
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				/*
 				 * swap_pager_copy() can sleep, in which case
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
 				 * Since swap_pager_copy() is being asked to
 				 * destroy backing_object, it will change the
 				 * type to OBJT_DEFAULT.
 				 */
 				swap_pager_copy(
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
 			}
 
 			/*
 			 * Object now shadows whatever backing_object did.
 			 */
 			vm_object_clear_flag(object, OBJ_COLLAPSING);
 			vm_object_backing_transfer(object, backing_object);
 			object->backing_object_offset +=
 			    backing_object->backing_object_offset;
 			VM_OBJECT_WUNLOCK(object);
 			vm_object_pip_wakeup(object);
 
 			/*
 			 * Discard backing_object.
 			 *
 			 * Since the backing object has no pages, no pager left,
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
 			KASSERT(backing_object->ref_count == 1, (
 "backing_object %p was somehow re-referenced during collapse!",
 			    backing_object));
 			vm_object_pip_wakeup(backing_object);
 			(void)refcount_release(&backing_object->ref_count);
 			vm_object_terminate(backing_object);
 			counter_u64_add(object_collapses, 1);
 			VM_OBJECT_WLOCK(object);
 		} else {
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
 			 *
 			 * The object lock and backing_object lock must not
 			 * be dropped during this sequence.
 			 */
 			if (!vm_object_scan_all_shadowed(object)) {
 				VM_OBJECT_WUNLOCK(backing_object);
 				break;
 			}
 
 			/*
 			 * Make the parent shadow the next object in the
 			 * chain.  Deallocating backing_object will not remove
 			 * it, since its reference count is at least 2.
 			 */
 			vm_object_backing_remove_locked(object);
 			new_backing_object = backing_object->backing_object;
 			if (new_backing_object != NULL) {
 				vm_object_backing_insert_ref(object,
 				    new_backing_object);
 				object->backing_object_offset +=
 				    backing_object->backing_object_offset;
 			}
 
 			/*
 			 * Drop the reference count on backing_object. Since
 			 * its ref_count was at least 2, it will not vanish.
 			 */
 			(void)refcount_release(&backing_object->ref_count);
 			KASSERT(backing_object->ref_count >= 1, (
 "backing_object %p was somehow dereferenced during collapse!",
 			    backing_object));
 			VM_OBJECT_WUNLOCK(backing_object);
 			counter_u64_add(object_bypasses, 1);
 		}
 
 		/*
 		 * Try again with this object's new backing object.
 		 */
 	}
 }
 
 /*
  *	vm_object_page_remove:
  *
  *	For the given object, either frees or invalidates each of the
  *	specified pages.  In general, a page is freed.  However, if a page is
  *	wired for any reason other than the existence of a managed, wired
  *	mapping, then it may be invalidated but not removed from the object.
  *	Pages are specified by the given range ["start", "end") and the option
  *	OBJPR_CLEANONLY.  As a special case, if "end" is zero, then the range
  *	extends from "start" to the end of the object.  If the option
  *	OBJPR_CLEANONLY is specified, then only the non-dirty pages within the
  *	specified range are affected.  If the option OBJPR_NOTMAPPED is
  *	specified, then the pages within the specified range must have no
  *	mappings.  Otherwise, if this option is not specified, any mappings to
  *	the specified pages are removed before the pages are freed or
  *	invalidated.
  *
  *	In general, this operation should only be performed on objects that
  *	contain managed pages.  There are, however, two exceptions.  First, it
  *	is performed on the kernel and kmem objects by vm_map_entry_delete().
  *	Second, it is used by msync(..., MS_INVALIDATE) to invalidate device-
  *	backed pages.  In both of these cases, the option OBJPR_CLEANONLY must
  *	not be specified and the option OBJPR_NOTMAPPED must be specified.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
     int options)
 {
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
 	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
 	    ("vm_object_page_remove: illegal options for object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	vm_object_pip_add(object, 1);
 again:
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
 	 * greater than or equal to the parameter "start" or (2) NULL. 
 	 */
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * If the page is wired for any reason besides the existence
 		 * of managed, wired mappings, then it cannot be freed.  For
 		 * example, fictitious pages, which represent device memory,
 		 * are inherently wired and cannot be freed.  They can,
 		 * however, be invalidated if the option OBJPR_CLEANONLY is
 		 * not specified.
 		 */
 		if (vm_page_tryxbusy(p) == 0) {
 			vm_page_sleep_if_busy(p, "vmopar");
 			goto again;
 		}
 		if (vm_page_wired(p)) {
 wired:
 			if ((options & OBJPR_NOTMAPPED) == 0 &&
 			    object->ref_count != 0)
 				pmap_remove_all(p);
 			if ((options & OBJPR_CLEANONLY) == 0) {
 				vm_page_invalid(p);
 				vm_page_undirty(p);
 			}
 			vm_page_xunbusy(p);
 			continue;
 		}
 		KASSERT((p->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_page_remove: page %p is fictitious", p));
 		if ((options & OBJPR_CLEANONLY) != 0 &&
 		    !vm_page_none_valid(p)) {
 			if ((options & OBJPR_NOTMAPPED) == 0 &&
 			    object->ref_count != 0 &&
 			    !vm_page_try_remove_write(p))
 				goto wired;
 			if (p->dirty != 0) {
 				vm_page_xunbusy(p);
 				continue;
 			}
 		}
 		if ((options & OBJPR_NOTMAPPED) == 0 &&
 		    object->ref_count != 0 && !vm_page_try_remove_all(p))
 			goto wired;
 		vm_page_free(p);
 	}
 	vm_object_pip_wakeup(object);
+
+	if (object->type == OBJT_SWAP) {
+		if (end == 0)
+			end = object->size;
+		swap_pager_freespace(object, start, end - start);
+	}
 }
 
 /*
  *	vm_object_page_noreuse:
  *
  *	For the given object, attempt to move the specified pages to
  *	the head of the inactive queue.  This bypasses regular LRU
  *	operation and allows the pages to be reused quickly under memory
  *	pressure.  If a page is wired for any reason, then it will not
  *	be queued.  Pages are specified by the range ["start", "end").
  *	As a special case, if "end" is zero, then the range extends from
  *	"start" to the end of the object.
  *
  *	This operation should only be performed on objects that
  *	contain non-fictitious, managed pages.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
 	    ("vm_object_page_noreuse: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
 	 * greater than or equal to the parameter "start" or (2) NULL. 
 	 */
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
 		vm_page_deactivate_noreuse(p);
 	}
 }
 
 /*
  *	Populate the specified range of the object with valid pages.  Returns
  *	TRUE if the range is successfully populated and FALSE otherwise.
  *
  *	Note: This function should be optimized to pass a larger array of
  *	pages to vm_pager_get_pages() before it is applied to a non-
  *	OBJT_DEVICE object.
  *
  *	The object must be locked.
  */
 boolean_t
 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	for (pindex = start; pindex < end; pindex++) {
 		rv = vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL);
 		if (rv != VM_PAGER_OK)
 			break;
 
 		/*
 		 * Keep "m" busy because a subsequent iteration may unlock
 		 * the object.
 		 */
 	}
 	if (pindex > start) {
 		m = vm_page_lookup(object, start);
 		while (m != NULL && m->pindex < pindex) {
 			vm_page_xunbusy(m);
 			m = TAILQ_NEXT(m, listq);
 		}
 	}
 	return (pindex == end);
 }
 
 /*
  *	Routine:	vm_object_coalesce
  *	Function:	Coalesces two objects backing up adjoining
  *			regions of memory into a single object.
  *
  *	returns TRUE if objects were combined.
  *
  *	NOTE:	Only works at the moment if the second object is NULL -
  *		if it's not, which object do we lock first?
  *
  *	Parameters:
  *		prev_object	First object to coalesce
  *		prev_offset	Offset into prev_object
  *		prev_size	Size of reference to prev_object
  *		next_size	Size of reference to the second object
  *		reserved	Indicator that extension region has
  *				swap accounted for
  *
  *	Conditions:
  *	The object must *not* be locked.
  */
 boolean_t
 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
     vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
 {
 	vm_pindex_t next_pindex;
 
 	if (prev_object == NULL)
 		return (TRUE);
 	if ((prev_object->flags & OBJ_ANON) == 0)
 		return (FALSE);
 
 	VM_OBJECT_WLOCK(prev_object);
 	/*
 	 * Try to collapse the object first.
 	 */
 	vm_object_collapse(prev_object);
 
 	/*
 	 * Can't coalesce if: . more than one reference . paged out . shadows
 	 * another object . has a copy elsewhere (any of which mean that the
 	 * pages not mapped to prev_entry may be in use anyway)
 	 */
 	if (prev_object->backing_object != NULL) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	prev_size >>= PAGE_SHIFT;
 	next_size >>= PAGE_SHIFT;
 	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
 
 	if (prev_object->ref_count > 1 &&
 	    prev_object->size != next_pindex &&
 	    (prev_object->flags & OBJ_ONEMAPPING) == 0) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/*
 	 * Account for the charge.
 	 */
 	if (prev_object->cred != NULL) {
 
 		/*
 		 * If prev_object was charged, then this mapping,
 		 * although not charged now, may become writable
 		 * later. Non-NULL cred in the object would prevent
 		 * swap reservation during enabling of the write
 		 * access, so reserve swap now. Failed reservation
 		 * cause allocation of the separate object for the map
 		 * entry, and swap reservation for this entry is
 		 * managed in appropriate time.
 		 */
 		if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
 		    prev_object->cred)) {
 			VM_OBJECT_WUNLOCK(prev_object);
 			return (FALSE);
 		}
 		prev_object->charge += ptoa(next_size);
 	}
 
 	/*
 	 * Remove any pages that may still be in the object from a previous
 	 * deallocation.
 	 */
 	if (next_pindex < prev_object->size) {
 		vm_object_page_remove(prev_object, next_pindex, next_pindex +
 		    next_size, 0);
-		if (prev_object->type == OBJT_SWAP)
-			swap_pager_freespace(prev_object,
-					     next_pindex, next_size);
 #if 0
 		if (prev_object->cred != NULL) {
 			KASSERT(prev_object->charge >=
 			    ptoa(prev_object->size - next_pindex),
 			    ("object %p overcharged 1 %jx %jx", prev_object,
 				(uintmax_t)next_pindex, (uintmax_t)next_size));
 			prev_object->charge -= ptoa(prev_object->size -
 			    next_pindex);
 		}
 #endif
 	}
 
 	/*
 	 * Extend the object if necessary.
 	 */
 	if (next_pindex + next_size > prev_object->size)
 		prev_object->size = next_pindex + next_size;
 
 	VM_OBJECT_WUNLOCK(prev_object);
 	return (TRUE);
 }
 
 void
 vm_object_set_writeable_dirty(vm_object_t object)
 {
 
 	/* Only set for vnodes & tmpfs */
 	if (object->type != OBJT_VNODE &&
 	    (object->flags & OBJ_TMPFS_NODE) == 0)
 		return;
 	atomic_add_int(&object->generation, 1);
 }
 
 /*
  *	vm_object_unwire:
  *
  *	For each page offset within the specified range of the given object,
  *	find the highest-level page in the shadow chain and unwire it.  A page
  *	must exist at every page offset, and the highest-level page must be
  *	wired.
  */
 void
 vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
     uint8_t queue)
 {
 	vm_object_t tobject, t1object;
 	vm_page_t m, tm;
 	vm_pindex_t end_pindex, pindex, tpindex;
 	int depth, locked_depth;
 
 	KASSERT((offset & PAGE_MASK) == 0,
 	    ("vm_object_unwire: offset is not page aligned"));
 	KASSERT((length & PAGE_MASK) == 0,
 	    ("vm_object_unwire: length is not a multiple of PAGE_SIZE"));
 	/* The wired count of a fictitious page never changes. */
 	if ((object->flags & OBJ_FICTITIOUS) != 0)
 		return;
 	pindex = OFF_TO_IDX(offset);
 	end_pindex = pindex + atop(length);
 again:
 	locked_depth = 1;
 	VM_OBJECT_RLOCK(object);
 	m = vm_page_find_least(object, pindex);
 	while (pindex < end_pindex) {
 		if (m == NULL || pindex < m->pindex) {
 			/*
 			 * The first object in the shadow chain doesn't
 			 * contain a page at the current index.  Therefore,
 			 * the page must exist in a backing object.
 			 */
 			tobject = object;
 			tpindex = pindex;
 			depth = 0;
 			do {
 				tpindex +=
 				    OFF_TO_IDX(tobject->backing_object_offset);
 				tobject = tobject->backing_object;
 				KASSERT(tobject != NULL,
 				    ("vm_object_unwire: missing page"));
 				if ((tobject->flags & OBJ_FICTITIOUS) != 0)
 					goto next_page;
 				depth++;
 				if (depth == locked_depth) {
 					locked_depth++;
 					VM_OBJECT_RLOCK(tobject);
 				}
 			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
 			    NULL);
 		} else {
 			tm = m;
 			m = TAILQ_NEXT(m, listq);
 		}
 		if (vm_page_trysbusy(tm) == 0) {
 			for (tobject = object; locked_depth >= 1;
 			    locked_depth--) {
 				t1object = tobject->backing_object;
 				if (tm->object != tobject)
 					VM_OBJECT_RUNLOCK(tobject);
 				tobject = t1object;
 			}
 			vm_page_busy_sleep(tm, "unwbo", true);
 			goto again;
 		}
 		vm_page_unwire(tm, queue);
 		vm_page_sunbusy(tm);
 next_page:
 		pindex++;
 	}
 	/* Release the accumulated object locks. */
 	for (tobject = object; locked_depth >= 1; locked_depth--) {
 		t1object = tobject->backing_object;
 		VM_OBJECT_RUNLOCK(tobject);
 		tobject = t1object;
 	}
 }
 
 /*
  * Return the vnode for the given object, or NULL if none exists.
  * For tmpfs objects, the function may return NULL if there is
  * no vnode allocated at the time of the call.
  */
 struct vnode *
 vm_object_vnode(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	if (object->type == OBJT_VNODE) {
 		vp = object->handle;
 		KASSERT(vp != NULL, ("%s: OBJT_VNODE has no vnode", __func__));
 	} else if (object->type == OBJT_SWAP &&
 	    (object->flags & OBJ_TMPFS) != 0) {
 		vp = object->un_pager.swp.swp_tmpfs;
 		KASSERT(vp != NULL, ("%s: OBJT_TMPFS has no vnode", __func__));
 	} else {
 		vp = NULL;
 	}
 	return (vp);
 }
 
 
 /*
  * Busy the vm object.  This prevents new pages belonging to the object from
  * becoming busy.  Existing pages persist as busy.  Callers are responsible
  * for checking page state before proceeding.
  */
 void
 vm_object_busy(vm_object_t obj)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(obj);
 
 	blockcount_acquire(&obj->busy, 1);
 	/* The fence is required to order loads of page busy. */
 	atomic_thread_fence_acq_rel();
 }
 
 void
 vm_object_unbusy(vm_object_t obj)
 {
 
 	blockcount_release(&obj->busy, 1);
 }
 
 void
 vm_object_busy_wait(vm_object_t obj, const char *wmesg)
 {
 
 	VM_OBJECT_ASSERT_UNLOCKED(obj);
 
 	(void)blockcount_sleep(&obj->busy, NULL, wmesg, PVM);
 }
 
 /*
  * Return the kvme type of the given object.
  * If vpp is not NULL, set it to the object's vm_object_vnode() or NULL.
  */
 int
 vm_object_kvme_type(vm_object_t object, struct vnode **vpp)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	if (vpp != NULL)
 		*vpp = vm_object_vnode(object);
 	switch (object->type) {
 	case OBJT_DEFAULT:
 		return (KVME_TYPE_DEFAULT);
 	case OBJT_VNODE:
 		return (KVME_TYPE_VNODE);
 	case OBJT_SWAP:
 		if ((object->flags & OBJ_TMPFS_NODE) != 0)
 			return (KVME_TYPE_VNODE);
 		return (KVME_TYPE_SWAP);
 	case OBJT_DEVICE:
 		return (KVME_TYPE_DEVICE);
 	case OBJT_PHYS:
 		return (KVME_TYPE_PHYS);
 	case OBJT_DEAD:
 		return (KVME_TYPE_DEAD);
 	case OBJT_SG:
 		return (KVME_TYPE_SG);
 	case OBJT_MGTDEVICE:
 		return (KVME_TYPE_MGTDEVICE);
 	default:
 		return (KVME_TYPE_UNKNOWN);
 	}
 }
 
 static int
 sysctl_vm_object_list(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_vmobject *kvo;
 	char *fullpath, *freepath;
 	struct vnode *vp;
 	struct vattr va;
 	vm_object_t obj;
 	vm_page_t m;
 	int count, error;
 
 	if (req->oldptr == NULL) {
 		/*
 		 * If an old buffer has not been provided, generate an
 		 * estimate of the space needed for a subsequent call.
 		 */
 		mtx_lock(&vm_object_list_mtx);
 		count = 0;
 		TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 			if (obj->type == OBJT_DEAD)
 				continue;
 			count++;
 		}
 		mtx_unlock(&vm_object_list_mtx);
 		return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) *
 		    count * 11 / 10));
 	}
 
 	kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK);
 	error = 0;
 
 	/*
 	 * VM objects are type stable and are never removed from the
 	 * list once added.  This allows us to safely read obj->object_list
 	 * after reacquiring the VM object lock.
 	 */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 		if (obj->type == OBJT_DEAD)
 			continue;
 		VM_OBJECT_RLOCK(obj);
 		if (obj->type == OBJT_DEAD) {
 			VM_OBJECT_RUNLOCK(obj);
 			continue;
 		}
 		mtx_unlock(&vm_object_list_mtx);
 		kvo->kvo_size = ptoa(obj->size);
 		kvo->kvo_resident = obj->resident_page_count;
 		kvo->kvo_ref_count = obj->ref_count;
 		kvo->kvo_shadow_count = obj->shadow_count;
 		kvo->kvo_memattr = obj->memattr;
 		kvo->kvo_active = 0;
 		kvo->kvo_inactive = 0;
 		TAILQ_FOREACH(m, &obj->memq, listq) {
 			/*
 			 * A page may belong to the object but be
 			 * dequeued and set to PQ_NONE while the
 			 * object lock is not held.  This makes the
 			 * reads of m->queue below racy, and we do not
 			 * count pages set to PQ_NONE.  However, this
 			 * sysctl is only meant to give an
 			 * approximation of the system anyway.
 			 */
 			if (m->a.queue == PQ_ACTIVE)
 				kvo->kvo_active++;
 			else if (m->a.queue == PQ_INACTIVE)
 				kvo->kvo_inactive++;
 		}
 
 		kvo->kvo_vn_fileid = 0;
 		kvo->kvo_vn_fsid = 0;
 		kvo->kvo_vn_fsid_freebsd11 = 0;
 		freepath = NULL;
 		fullpath = "";
 		kvo->kvo_type = vm_object_kvme_type(obj, &vp);
 		if (vp != NULL)
 			vref(vp);
 		VM_OBJECT_RUNLOCK(obj);
 		if (vp != NULL) {
 			vn_fullpath(curthread, vp, &fullpath, &freepath);
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) {
 				kvo->kvo_vn_fileid = va.va_fileid;
 				kvo->kvo_vn_fsid = va.va_fsid;
 				kvo->kvo_vn_fsid_freebsd11 = va.va_fsid;
 								/* truncate */
 			}
 			vput(vp);
 		}
 
 		strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		/* Pack record size down */
 		kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path)
 		    + strlen(kvo->kvo_path) + 1;
 		kvo->kvo_structsize = roundup(kvo->kvo_structsize,
 		    sizeof(uint64_t));
 		error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize);
 		mtx_lock(&vm_object_list_mtx);
 		if (error)
 			break;
 	}
 	mtx_unlock(&vm_object_list_mtx);
 	free(kvo, M_TEMP);
 	return (error);
 }
 SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject",
     "List of VM objects");
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <sys/cons.h>
 
 #include <ddb/ddb.h>
 
 static int
 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
 {
 	vm_map_t tmpm;
 	vm_map_entry_t tmpe;
 	vm_object_t obj;
 
 	if (map == 0)
 		return 0;
 
 	if (entry == 0) {
 		VM_MAP_ENTRY_FOREACH(tmpe, map) {
 			if (_vm_object_in_map(map, object, tmpe)) {
 				return 1;
 			}
 		}
 	} else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		tmpm = entry->object.sub_map;
 		VM_MAP_ENTRY_FOREACH(tmpe, tmpm) {
 			if (_vm_object_in_map(tmpm, object, tmpe)) {
 				return 1;
 			}
 		}
 	} else if ((obj = entry->object.vm_object) != NULL) {
 		for (; obj; obj = obj->backing_object)
 			if (obj == object) {
 				return 1;
 			}
 	}
 	return 0;
 }
 
 static int
 vm_object_in_map(vm_object_t object)
 {
 	struct proc *p;
 
 	/* sx_slock(&allproc_lock); */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
 			continue;
 		if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
 			/* sx_sunlock(&allproc_lock); */
 			return 1;
 		}
 	}
 	/* sx_sunlock(&allproc_lock); */
 	if (_vm_object_in_map(kernel_map, object, 0))
 		return 1;
 	return 0;
 }
 
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t object;
 
 	/*
 	 * make sure that internal objs are in a map somewhere
 	 * and none have zero ref counts.
 	 */
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		if ((object->flags & OBJ_ANON) != 0) {
 			if (object->ref_count == 0) {
 				db_printf("vmochk: internal obj has zero ref count: %ld\n",
 					(long)object->size);
 			}
 			if (!vm_object_in_map(object)) {
 				db_printf(
 			"vmochk: internal obj is not in a map: "
 			"ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
 				    object->ref_count, (u_long)object->size, 
 				    (u_long)object->size,
 				    (void *)object->backing_object);
 			}
 		}
 		if (db_pager_quit)
 			return;
 	}
 }
 
 /*
  *	vm_object_print:	[ debug ]
  */
 DB_SHOW_COMMAND(object, vm_object_print_static)
 {
 	/* XXX convert args. */
 	vm_object_t object = (vm_object_t)addr;
 	boolean_t full = have_addr;
 
 	vm_page_t p;
 
 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
 #define	count	was_count
 
 	int count;
 
 	if (object == NULL)
 		return;
 
 	db_iprintf(
 	    "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n",
 	    object, (int)object->type, (uintmax_t)object->size,
 	    object->resident_page_count, object->ref_count, object->flags,
 	    object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge);
 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
 	    object->shadow_count, 
 	    object->backing_object ? object->backing_object->ref_count : 0,
 	    object->backing_object, (uintmax_t)object->backing_object_offset);
 
 	if (!full)
 		return;
 
 	db_indent += 2;
 	count = 0;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		if (count == 0)
 			db_iprintf("memory:=");
 		else if (count == 6) {
 			db_printf("\n");
 			db_iprintf(" ...");
 			count = 0;
 		} else
 			db_printf(",");
 		count++;
 
 		db_printf("(off=0x%jx,page=0x%jx)",
 		    (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
 
 		if (db_pager_quit)
 			break;
 	}
 	if (count != 0)
 		db_printf("\n");
 	db_indent -= 2;
 }
 
 /* XXX. */
 #undef count
 
 /* XXX need this non-static entry for calling from vm_map_print. */
 void
 vm_object_print(
         /* db_expr_t */ long addr,
 	boolean_t have_addr,
 	/* db_expr_t */ long count,
 	char *modif)
 {
 	vm_object_print_static(addr, have_addr, count, modif);
 }
 
 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
 {
 	vm_object_t object;
 	vm_pindex_t fidx;
 	vm_paddr_t pa;
 	vm_page_t m, prev_m;
 	int rcount, nl, c;
 
 	nl = 0;
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		db_printf("new object: %p\n", (void *)object);
 		if (nl > 18) {
 			c = cngetc();
 			if (c != ' ')
 				return;
 			nl = 0;
 		}
 		nl++;
 		rcount = 0;
 		fidx = 0;
 		pa = -1;
 		TAILQ_FOREACH(m, &object->memq, listq) {
 			if (m->pindex > 128)
 				break;
 			if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL &&
 			    prev_m->pindex + 1 != m->pindex) {
 				if (rcount) {
 					db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 						(long)fidx, rcount, (long)pa);
 					if (nl > 18) {
 						c = cngetc();
 						if (c != ' ')
 							return;
 						nl = 0;
 					}
 					nl++;
 					rcount = 0;
 				}
 			}				
 			if (rcount &&
 				(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
 				++rcount;
 				continue;
 			}
 			if (rcount) {
 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 					(long)fidx, rcount, (long)pa);
 				if (nl > 18) {
 					c = cngetc();
 					if (c != ' ')
 						return;
 					nl = 0;
 				}
 				nl++;
 			}
 			fidx = m->pindex;
 			pa = VM_PAGE_TO_PHYS(m);
 			rcount = 1;
 		}
 		if (rcount) {
 			db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 				(long)fidx, rcount, (long)pa);
 			if (nl > 18) {
 				c = cngetc();
 				if (c != ' ')
 					return;
 				nl = 0;
 			}
 			nl++;
 		}
 	}
 }
 #endif /* DDB */